375 lines
11 KiB
Diff
375 lines
11 KiB
Diff
|
From 8706bf3e3cba8c708f9933f0d1c6a23f9c2c8c33 Mon Sep 17 00:00:00 2001
|
||
|
From: Eric Biggers <ebiggers@google.com>
|
||
|
Date: Sun, 13 Oct 2024 21:06:49 -0700
|
||
|
Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling
|
||
|
|
||
|
crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
|
||
|
unrolled and uses a jump table to jump into the correct location. This
|
||
|
optimization is misguided, as it bloats the binary code size and
|
||
|
introduces an indirect branch. x86_64 CPUs can predict loops well, so it
|
||
|
is fine to just use a loop instead. Loop bookkeeping instructions can
|
||
|
compete with the crc instructions for the ALUs, but this is easily
|
||
|
mitigated by unrolling the loop by a smaller amount, such as 4 times.
|
||
|
|
||
|
Therefore, re-roll the loop and make related tweaks to the code.
|
||
|
|
||
|
This reduces the binary code size of crc_pclmul() from 4546 bytes to 418
|
||
|
bytes, a 91% reduction. In general it also makes the code faster, with
|
||
|
some large improvements seen when retpoline is enabled.
|
||
|
|
||
|
More detailed performance results are shown below. They are given as
|
||
|
percent improvement in throughput (negative means regressed) for CPU
|
||
|
microarchitecture vs. input length in bytes. E.g. an improvement from
|
||
|
40 GB/s to 50 GB/s would be listed as 25%.
|
||
|
|
||
|
Table 1: Results with retpoline enabled (the default):
|
||
|
|
||
|
| 512 | 833 | 1024 | 2000 | 3173 | 4096 |
|
||
|
---------------------+-------+-------+-------+-------+-------+-------+
|
||
|
Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% |
|
||
|
Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% |
|
||
|
AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% |
|
||
|
|
||
|
Table 2: Results with retpoline disabled:
|
||
|
|
||
|
| 512 | 833 | 1024 | 2000 | 3173 | 4096 |
|
||
|
---------------------+-------+-------+-------+-------+-------+-------+
|
||
|
Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% |
|
||
|
Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% |
|
||
|
AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% |
|
||
|
|
||
|
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||
|
---
|
||
|
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
|
||
|
1 file changed, 92 insertions(+), 141 deletions(-)
|
||
|
|
||
|
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||
|
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||
|
@@ -7,6 +7,7 @@
|
||
|
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||
|
*
|
||
|
* Copyright (C) 2012 Intel Corporation.
|
||
|
+ * Copyright 2024 Google LLC
|
||
|
*
|
||
|
* Authors:
|
||
|
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||
|
@@ -44,18 +45,9 @@
|
||
|
*/
|
||
|
|
||
|
#include <linux/linkage.h>
|
||
|
-#include <asm/nospec-branch.h>
|
||
|
|
||
|
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||
|
|
||
|
-.macro LABEL prefix n
|
||
|
-.L\prefix\n\():
|
||
|
-.endm
|
||
|
-
|
||
|
-.macro JMPTBL_ENTRY i
|
||
|
-.quad .Lcrc_\i
|
||
|
-.endm
|
||
|
-
|
||
|
# Define threshold below which buffers are considered "small" and routed to
|
||
|
# regular CRC code that does not interleave the CRC instructions.
|
||
|
#define SMALL_SIZE 200
|
||
|
@@ -64,139 +56,116 @@
|
||
|
|
||
|
.text
|
||
|
SYM_FUNC_START(crc_pcl)
|
||
|
-#define bufp rdi
|
||
|
-#define bufp_dw %edi
|
||
|
-#define bufp_w %di
|
||
|
-#define bufp_b %dil
|
||
|
-#define bufptmp %rcx
|
||
|
-#define block_0 %rcx
|
||
|
-#define block_1 %rdx
|
||
|
-#define block_2 %r11
|
||
|
-#define len %esi
|
||
|
-#define crc_init_arg %edx
|
||
|
-#define tmp %rbx
|
||
|
-#define crc_init %r8d
|
||
|
-#define crc_init_q %r8
|
||
|
-#define crc1 %r9
|
||
|
-#define crc2 %r10
|
||
|
-
|
||
|
- pushq %rbx
|
||
|
- pushq %rdi
|
||
|
- pushq %rsi
|
||
|
-
|
||
|
- ## Move crc_init for Linux to a different
|
||
|
- mov crc_init_arg, crc_init
|
||
|
+#define bufp %rdi
|
||
|
+#define bufp_d %edi
|
||
|
+#define len %esi
|
||
|
+#define crc_init %edx
|
||
|
+#define crc_init_q %rdx
|
||
|
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
|
||
|
+#define n_misaligned_q %rcx
|
||
|
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
|
||
|
+#define chunk_bytes_q %rcx
|
||
|
+#define crc1 %r8
|
||
|
+#define crc2 %r9
|
||
|
|
||
|
- mov %bufp, bufptmp # rdi = *buf
|
||
|
cmp $SMALL_SIZE, len
|
||
|
jb .Lsmall
|
||
|
|
||
|
################################################################
|
||
|
## 1) ALIGN:
|
||
|
################################################################
|
||
|
- neg %bufp
|
||
|
- and $7, %bufp # calculate the unalignment amount of
|
||
|
+ mov bufp_d, n_misaligned
|
||
|
+ neg n_misaligned
|
||
|
+ and $7, n_misaligned # calculate the misalignment amount of
|
||
|
# the address
|
||
|
- je .Lproc_block # Skip if aligned
|
||
|
+ je .Laligned # Skip if aligned
|
||
|
|
||
|
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
|
||
|
+ # the remaining data to an 8-byte boundary.
|
||
|
.Ldo_align:
|
||
|
- #### Calculate CRC of unaligned bytes of the buffer (if any)
|
||
|
- movq (bufptmp), tmp # load a quadward from the buffer
|
||
|
- add %bufp, bufptmp # align buffer pointer for quadword
|
||
|
- # processing
|
||
|
- sub bufp_dw, len # update buffer length
|
||
|
+ movq (bufp), %rax
|
||
|
+ add n_misaligned_q, bufp
|
||
|
+ sub n_misaligned, len
|
||
|
.Lalign_loop:
|
||
|
- crc32b %bl, crc_init # compute crc32 of 1-byte
|
||
|
- shr $8, tmp # get next byte
|
||
|
- dec %bufp
|
||
|
+ crc32b %al, crc_init # compute crc32 of 1-byte
|
||
|
+ shr $8, %rax # get next byte
|
||
|
+ dec n_misaligned
|
||
|
jne .Lalign_loop
|
||
|
-
|
||
|
-.Lproc_block:
|
||
|
+.Laligned:
|
||
|
|
||
|
################################################################
|
||
|
- ## 2) PROCESS BLOCKS:
|
||
|
+ ## 2) PROCESS BLOCK:
|
||
|
################################################################
|
||
|
|
||
|
- ## compute num of bytes to be processed
|
||
|
-
|
||
|
cmp $128*24, len
|
||
|
jae .Lfull_block
|
||
|
|
||
|
-.Lcontinue_block:
|
||
|
- ## len < 128*24
|
||
|
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||
|
- mul len
|
||
|
- shrq $16, %rax
|
||
|
-
|
||
|
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||
|
-
|
||
|
- ## process rax 24-byte chunks (128 >= rax >= 0)
|
||
|
-
|
||
|
- ## compute end address of each block
|
||
|
- ## block 0 (base addr + RAX * 8)
|
||
|
- ## block 1 (base addr + RAX * 16)
|
||
|
- ## block 2 (base addr + RAX * 24)
|
||
|
- lea (bufptmp, %rax, 8), block_0
|
||
|
- lea (block_0, %rax, 8), block_1
|
||
|
- lea (block_1, %rax, 8), block_2
|
||
|
-
|
||
|
- xor crc1, crc1
|
||
|
- xor crc2, crc2
|
||
|
-
|
||
|
- ## branch into array
|
||
|
- leaq jump_table(%rip), %bufp
|
||
|
- mov (%bufp,%rax,8), %bufp
|
||
|
- JMP_NOSPEC bufp
|
||
|
+.Lpartial_block:
|
||
|
+ # Compute floor(len / 24) to get num qwords to process from each lane.
|
||
|
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
|
||
|
+ shr $16, %eax
|
||
|
+ jmp .Lcrc_3lanes
|
||
|
|
||
|
- ################################################################
|
||
|
- ## 2a) PROCESS FULL BLOCKS:
|
||
|
- ################################################################
|
||
|
.Lfull_block:
|
||
|
- movl $128,%eax
|
||
|
- lea 128*8*2(block_0), block_1
|
||
|
- lea 128*8*3(block_0), block_2
|
||
|
- add $128*8*1, block_0
|
||
|
-
|
||
|
- xor crc1,crc1
|
||
|
- xor crc2,crc2
|
||
|
-
|
||
|
- # Fall through into top of crc array (crc_128)
|
||
|
+ # Processing 128 qwords from each lane.
|
||
|
+ mov $128, %eax
|
||
|
|
||
|
################################################################
|
||
|
- ## 3) CRC Array:
|
||
|
+ ## 3) CRC each of three lanes:
|
||
|
################################################################
|
||
|
|
||
|
- i=128
|
||
|
-.rept 128-1
|
||
|
-.altmacro
|
||
|
-LABEL crc_ %i
|
||
|
-.noaltmacro
|
||
|
- ENDBR
|
||
|
- crc32q -i*8(block_0), crc_init_q
|
||
|
- crc32q -i*8(block_1), crc1
|
||
|
- crc32q -i*8(block_2), crc2
|
||
|
- i=(i-1)
|
||
|
-.endr
|
||
|
-
|
||
|
-.altmacro
|
||
|
-LABEL crc_ %i
|
||
|
-.noaltmacro
|
||
|
- ENDBR
|
||
|
- crc32q -i*8(block_0), crc_init_q
|
||
|
- crc32q -i*8(block_1), crc1
|
||
|
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||
|
+.Lcrc_3lanes:
|
||
|
+ xor crc1,crc1
|
||
|
+ xor crc2,crc2
|
||
|
+ mov %eax, chunk_bytes
|
||
|
+ shl $3, chunk_bytes # num bytes to process from each lane
|
||
|
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
|
||
|
+ jl .Lcrc_3lanes_4x_done
|
||
|
+
|
||
|
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
|
||
|
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
|
||
|
+.Lcrc_3lanes_4x_loop:
|
||
|
+ crc32q (bufp), crc_init_q
|
||
|
+ crc32q (bufp,chunk_bytes_q), crc1
|
||
|
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||
|
+ crc32q 8(bufp), crc_init_q
|
||
|
+ crc32q 8(bufp,chunk_bytes_q), crc1
|
||
|
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
|
||
|
+ crc32q 16(bufp), crc_init_q
|
||
|
+ crc32q 16(bufp,chunk_bytes_q), crc1
|
||
|
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
|
||
|
+ crc32q 24(bufp), crc_init_q
|
||
|
+ crc32q 24(bufp,chunk_bytes_q), crc1
|
||
|
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
|
||
|
+ add $32, bufp
|
||
|
+ sub $4, %eax
|
||
|
+ jge .Lcrc_3lanes_4x_loop
|
||
|
+
|
||
|
+.Lcrc_3lanes_4x_done:
|
||
|
+ add $4, %eax
|
||
|
+ jz .Lcrc_3lanes_last_qword
|
||
|
+
|
||
|
+.Lcrc_3lanes_1x_loop:
|
||
|
+ crc32q (bufp), crc_init_q
|
||
|
+ crc32q (bufp,chunk_bytes_q), crc1
|
||
|
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||
|
+ add $8, bufp
|
||
|
+ dec %eax
|
||
|
+ jnz .Lcrc_3lanes_1x_loop
|
||
|
|
||
|
- mov block_2, block_0
|
||
|
+.Lcrc_3lanes_last_qword:
|
||
|
+ crc32q (bufp), crc_init_q
|
||
|
+ crc32q (bufp,chunk_bytes_q), crc1
|
||
|
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
|
||
|
|
||
|
################################################################
|
||
|
## 4) Combine three results:
|
||
|
################################################################
|
||
|
|
||
|
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
|
||
|
- shlq $3, %rax # rax *= 8
|
||
|
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||
|
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||
|
- sub %eax, len # len -= rax*24
|
||
|
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
|
||
|
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
|
||
|
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
|
||
|
+ sub %eax, len # len -= chunk_bytes * 3
|
||
|
|
||
|
movq crc_init_q, %xmm1 # CRC for block 1
|
||
|
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||
|
@@ -206,20 +175,19 @@ LABEL crc_ %i
|
||
|
|
||
|
pxor %xmm2,%xmm1
|
||
|
movq %xmm1, %rax
|
||
|
- xor -i*8(block_2), %rax
|
||
|
+ xor (bufp,chunk_bytes_q,2), %rax
|
||
|
mov crc2, crc_init_q
|
||
|
crc32 %rax, crc_init_q
|
||
|
+ lea 8(bufp,chunk_bytes_q,2), bufp
|
||
|
|
||
|
################################################################
|
||
|
- ## 5) Check for end:
|
||
|
+ ## 5) If more blocks remain, goto (2):
|
||
|
################################################################
|
||
|
|
||
|
-LABEL crc_ 0
|
||
|
- ENDBR
|
||
|
cmp $128*24, len
|
||
|
- jae .Lfull_block
|
||
|
+ jae .Lfull_block
|
||
|
cmp $SMALL_SIZE, len
|
||
|
- jae .Lcontinue_block
|
||
|
+ jae .Lpartial_block
|
||
|
|
||
|
#######################################################################
|
||
|
## 6) Process any remainder without interleaving:
|
||
|
@@ -231,47 +199,30 @@ LABEL crc_ 0
|
||
|
shr $3, %eax
|
||
|
jz .Ldo_dword
|
||
|
.Ldo_qwords:
|
||
|
- crc32q (bufptmp), crc_init_q
|
||
|
- add $8, bufptmp
|
||
|
+ crc32q (bufp), crc_init_q
|
||
|
+ add $8, bufp
|
||
|
dec %eax
|
||
|
jnz .Ldo_qwords
|
||
|
.Ldo_dword:
|
||
|
test $4, len
|
||
|
jz .Ldo_word
|
||
|
- crc32l (bufptmp), crc_init
|
||
|
- add $4, bufptmp
|
||
|
+ crc32l (bufp), crc_init
|
||
|
+ add $4, bufp
|
||
|
.Ldo_word:
|
||
|
test $2, len
|
||
|
jz .Ldo_byte
|
||
|
- crc32w (bufptmp), crc_init
|
||
|
- add $2, bufptmp
|
||
|
+ crc32w (bufp), crc_init
|
||
|
+ add $2, bufp
|
||
|
.Ldo_byte:
|
||
|
test $1, len
|
||
|
jz .Ldone
|
||
|
- crc32b (bufptmp), crc_init
|
||
|
+ crc32b (bufp), crc_init
|
||
|
.Ldone:
|
||
|
mov crc_init, %eax
|
||
|
- popq %rsi
|
||
|
- popq %rdi
|
||
|
- popq %rbx
|
||
|
RET
|
||
|
SYM_FUNC_END(crc_pcl)
|
||
|
|
||
|
.section .rodata, "a", @progbits
|
||
|
- ################################################################
|
||
|
- ## jump table Table is 129 entries x 2 bytes each
|
||
|
- ################################################################
|
||
|
-.align 4
|
||
|
-jump_table:
|
||
|
- i=0
|
||
|
-.rept 129
|
||
|
-.altmacro
|
||
|
-JMPTBL_ENTRY %i
|
||
|
-.noaltmacro
|
||
|
- i=i+1
|
||
|
-.endr
|
||
|
-
|
||
|
-
|
||
|
################################################################
|
||
|
## PCLMULQDQ tables
|
||
|
## Table is 128 entries x 2 words (8 bytes) each
|