refresh patches
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
From 594316efc465f1408482e0d1dd379f4e3a6a5c7c Mon Sep 17 00:00:00 2001
|
||||
From: Eric Biggers <ebiggers@google.com>
|
||||
Date: Mon, 27 Jan 2025 13:16:09 -0800
|
||||
Subject: crypto: x86/aes-xts - make the fast path 64-bit specific
|
||||
|
||||
Remove 32-bit support from the fast path in xts_crypt(). Then optimize
|
||||
it for 64-bit, and simplify the code, by switching to sg_virt() and
|
||||
removing the now-unnecessary checks for crossing a page boundary.
|
||||
|
||||
The result is simpler code that is slightly smaller and faster in the
|
||||
case that actually matters (64-bit).
|
||||
|
||||
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||||
---
|
||||
arch/x86/crypto/aesni-intel_glue.c | 30 ++++++++++--------------------
|
||||
1 file changed, 10 insertions(+), 20 deletions(-)
|
||||
|
||||
--- a/arch/x86/crypto/aesni-intel_glue.c
|
||||
+++ b/arch/x86/crypto/aesni-intel_glue.c
|
||||
@@ -581,11 +581,8 @@ xts_crypt(struct skcipher_request *req,
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
|
||||
- const unsigned int cryptlen = req->cryptlen;
|
||||
- struct scatterlist *src = req->src;
|
||||
- struct scatterlist *dst = req->dst;
|
||||
|
||||
- if (unlikely(cryptlen < AES_BLOCK_SIZE))
|
||||
+ if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
|
||||
return -EINVAL;
|
||||
|
||||
kernel_fpu_begin();
|
||||
@@ -593,23 +590,16 @@ xts_crypt(struct skcipher_request *req,
|
||||
|
||||
/*
|
||||
* In practice, virtually all XTS plaintexts and ciphertexts are either
|
||||
- * 512 or 4096 bytes, aligned such that they don't span page boundaries.
|
||||
- * To optimize the performance of these cases, and also any other case
|
||||
- * where no page boundary is spanned, the below fast-path handles
|
||||
- * single-page sources and destinations as efficiently as possible.
|
||||
+ * 512 or 4096 bytes and do not use multiple scatterlist elements. To
|
||||
+ * optimize the performance of these cases, the below fast-path handles
|
||||
+ * single-scatterlist-element messages as efficiently as possible. The
|
||||
+ * code is 64-bit specific, as it assumes no page mapping is needed.
|
||||
*/
|
||||
- if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
|
||||
- src->offset + cryptlen <= PAGE_SIZE &&
|
||||
- dst->offset + cryptlen <= PAGE_SIZE)) {
|
||||
- struct page *src_page = sg_page(src);
|
||||
- struct page *dst_page = sg_page(dst);
|
||||
- void *src_virt = kmap_local_page(src_page) + src->offset;
|
||||
- void *dst_virt = kmap_local_page(dst_page) + dst->offset;
|
||||
-
|
||||
- (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
|
||||
- req->iv);
|
||||
- kunmap_local(dst_virt);
|
||||
- kunmap_local(src_virt);
|
||||
+ if (IS_ENABLED(CONFIG_X86_64) &&
|
||||
+ likely(req->src->length >= req->cryptlen &&
|
||||
+ req->dst->length >= req->cryptlen)) {
|
||||
+ (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
|
||||
+ sg_virt(req->dst), req->cryptlen, req->iv);
|
||||
kernel_fpu_end();
|
||||
return 0;
|
||||
}
|
@@ -1,181 +0,0 @@
|
||||
From 0a957679a29a06fb2e3971615ff9f05f6becb941 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Biggers <ebiggers@google.com>
|
||||
Date: Sun, 13 Oct 2024 21:06:49 -0700
|
||||
Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
|
||||
|
||||
The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
|
||||
lengths >= 512, due to the overhead of saving and restoring FPU state.
|
||||
Therefore, it is unnecessary for this code to be excessively "optimized"
|
||||
for lengths < 200. Eliminate the excessive unrolling of this part of
|
||||
the code and use a more straightforward qword-at-a-time loop.
|
||||
|
||||
Note: the part of the code in question is not entirely redundant, as it
|
||||
is still used to process any remainder mod 24, as well as any remaining
|
||||
data when fewer than 200 bytes remain after at least one 3072-byte chunk.
|
||||
|
||||
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||||
---
|
||||
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
|
||||
1 file changed, 33 insertions(+), 83 deletions(-)
|
||||
|
||||
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
@@ -56,20 +56,10 @@
|
||||
.quad .Lcrc_\i
|
||||
.endm
|
||||
|
||||
-.macro JNC_LESS_THAN j
|
||||
- jnc .Lless_than_\j
|
||||
-.endm
|
||||
-
|
||||
-# Define threshold where buffers are considered "small" and routed to more
|
||||
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
|
||||
-# SMALL_SIZE can be no larger than 255.
|
||||
-
|
||||
+# Define threshold below which buffers are considered "small" and routed to
|
||||
+# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
|
||||
-.if (SMALL_SIZE > 255)
|
||||
-.error "SMALL_ SIZE must be < 256"
|
||||
-.endif
|
||||
-
|
||||
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
||||
|
||||
.text
|
||||
@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl)
|
||||
## Move crc_init for Linux to a different
|
||||
mov crc_init_arg, crc_init
|
||||
|
||||
+ mov %bufp, bufptmp # rdi = *buf
|
||||
+ cmp $SMALL_SIZE, len
|
||||
+ jb .Lsmall
|
||||
+
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
-
|
||||
- mov %bufp, bufptmp # rdi = *buf
|
||||
neg %bufp
|
||||
and $7, %bufp # calculate the unalignment amount of
|
||||
# the address
|
||||
je .Lproc_block # Skip if aligned
|
||||
|
||||
- ## If len is less than 8 and we're unaligned, we need to jump
|
||||
- ## to special code to avoid reading beyond the end of the buffer
|
||||
- cmp $8, len
|
||||
- jae .Ldo_align
|
||||
- # less_than_8 expects length in upper 3 bits of len_dw
|
||||
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
- shl $32-3+1, len_dw
|
||||
- jmp .Lless_than_8_post_shl1
|
||||
-
|
||||
.Ldo_align:
|
||||
#### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
movq (bufptmp), tmp # load a quadward from the buffer
|
||||
@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl)
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
- cmpq $SMALL_SIZE, len
|
||||
- jb .Lsmall
|
||||
-
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
mul len_dw
|
||||
@@ -243,68 +223,38 @@ LABEL crc_ 0
|
||||
mov tmp, len
|
||||
cmp $128*24, tmp
|
||||
jae .Lfull_block
|
||||
- cmp $24, tmp
|
||||
+ cmp $SMALL_SIZE, tmp
|
||||
jae .Lcontinue_block
|
||||
|
||||
-.Lless_than_24:
|
||||
- shl $32-4, len_dw # less_than_16 expects length
|
||||
- # in upper 4 bits of len_dw
|
||||
- jnc .Lless_than_16
|
||||
- crc32q (bufptmp), crc_init
|
||||
- crc32q 8(bufptmp), crc_init
|
||||
- jz .Ldo_return
|
||||
- add $16, bufptmp
|
||||
- # len is less than 8 if we got here
|
||||
- # less_than_8 expects length in upper 3 bits of len_dw
|
||||
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
- shl $2, len_dw
|
||||
- jmp .Lless_than_8_post_shl1
|
||||
-
|
||||
#######################################################################
|
||||
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
|
||||
+ ## 6) Process any remainder without interleaving:
|
||||
#######################################################################
|
||||
.Lsmall:
|
||||
- shl $32-8, len_dw # Prepare len_dw for less_than_256
|
||||
- j=256
|
||||
-.rept 5 # j = {256, 128, 64, 32, 16}
|
||||
-.altmacro
|
||||
-LABEL less_than_ %j # less_than_j: Length should be in
|
||||
- # upper lg(j) bits of len_dw
|
||||
- j=(j/2)
|
||||
- shl $1, len_dw # Get next MSB
|
||||
- JNC_LESS_THAN %j
|
||||
-.noaltmacro
|
||||
- i=0
|
||||
-.rept (j/8)
|
||||
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
|
||||
- i=i+8
|
||||
-.endr
|
||||
- jz .Ldo_return # Return if remaining length is zero
|
||||
- add $j, bufptmp # Advance buf
|
||||
-.endr
|
||||
-
|
||||
-.Lless_than_8: # Length should be stored in
|
||||
- # upper 3 bits of len_dw
|
||||
- shl $1, len_dw
|
||||
-.Lless_than_8_post_shl1:
|
||||
- jnc .Lless_than_4
|
||||
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
|
||||
- jz .Ldo_return # return if remaining data is zero
|
||||
- add $4, bufptmp
|
||||
-.Lless_than_4: # Length should be stored in
|
||||
- # upper 2 bits of len_dw
|
||||
- shl $1, len_dw
|
||||
- jnc .Lless_than_2
|
||||
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
|
||||
- jz .Ldo_return # return if remaining data is zero
|
||||
- add $2, bufptmp
|
||||
-.Lless_than_2: # Length should be stored in the MSB
|
||||
- # of len_dw
|
||||
- shl $1, len_dw
|
||||
- jnc .Lless_than_1
|
||||
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
|
||||
-.Lless_than_1: # Length should be zero
|
||||
-.Ldo_return:
|
||||
+ test len, len
|
||||
+ jz .Ldone
|
||||
+ mov len_dw, %eax
|
||||
+ shr $3, %eax
|
||||
+ jz .Ldo_dword
|
||||
+.Ldo_qwords:
|
||||
+ crc32q (bufptmp), crc_init
|
||||
+ add $8, bufptmp
|
||||
+ dec %eax
|
||||
+ jnz .Ldo_qwords
|
||||
+.Ldo_dword:
|
||||
+ test $4, len_dw
|
||||
+ jz .Ldo_word
|
||||
+ crc32l (bufptmp), crc_init_dw
|
||||
+ add $4, bufptmp
|
||||
+.Ldo_word:
|
||||
+ test $2, len_dw
|
||||
+ jz .Ldo_byte
|
||||
+ crc32w (bufptmp), crc_init_dw
|
||||
+ add $2, bufptmp
|
||||
+.Ldo_byte:
|
||||
+ test $1, len_dw
|
||||
+ jz .Ldone
|
||||
+ crc32b (bufptmp), crc_init_dw
|
||||
+.Ldone:
|
||||
movq crc_init, %rax
|
||||
popq %rsi
|
||||
popq %rdi
|
1857
debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch
vendored
Normal file
1857
debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,187 +0,0 @@
|
||||
From 3ed4205afe9305d71d055554ba27e7b8923865dc Mon Sep 17 00:00:00 2001
|
||||
From: Eric Biggers <ebiggers@google.com>
|
||||
Date: Sun, 13 Oct 2024 21:06:49 -0700
|
||||
Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit
|
||||
|
||||
Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit
|
||||
values instead of 64-bit, since the upper bits of the corresponding
|
||||
64-bit registers are not guaranteed to be zero. Also update the type of
|
||||
the length argument to be unsigned int rather than int, as the assembly
|
||||
code treats it as unsigned.
|
||||
|
||||
Note: there haven't been any reports of this bug actually causing
|
||||
incorrect behavior. Neither gcc nor clang guarantee zero-extension to
|
||||
64 bits, but zero-extension is likely to happen in practice because most
|
||||
instructions that operate on 32-bit registers zero-extend to 64 bits.
|
||||
|
||||
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||||
---
|
||||
arch/x86/crypto/crc32c-intel_glue.c | 2 +-
|
||||
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------
|
||||
2 files changed, 27 insertions(+), 32 deletions(-)
|
||||
|
||||
--- a/arch/x86/crypto/crc32c-intel_glue.c
|
||||
+++ b/arch/x86/crypto/crc32c-intel_glue.c
|
||||
@@ -41,7 +41,7 @@
|
||||
*/
|
||||
#define CRC32C_PCL_BREAKEVEN 512
|
||||
|
||||
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
|
||||
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
|
||||
unsigned int crc_init);
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
@@ -60,7 +60,7 @@
|
||||
# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
|
||||
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
||||
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
|
||||
|
||||
.text
|
||||
SYM_FUNC_START(crc_pcl)
|
||||
@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl)
|
||||
#define block_0 %rcx
|
||||
#define block_1 %rdx
|
||||
#define block_2 %r11
|
||||
-#define len %rsi
|
||||
-#define len_dw %esi
|
||||
-#define len_w %si
|
||||
-#define len_b %sil
|
||||
-#define crc_init_arg %rdx
|
||||
+#define len %esi
|
||||
+#define crc_init_arg %edx
|
||||
#define tmp %rbx
|
||||
-#define crc_init %r8
|
||||
-#define crc_init_dw %r8d
|
||||
+#define crc_init %r8d
|
||||
+#define crc_init_q %r8
|
||||
#define crc1 %r9
|
||||
#define crc2 %r10
|
||||
|
||||
@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl)
|
||||
movq (bufptmp), tmp # load a quadward from the buffer
|
||||
add %bufp, bufptmp # align buffer pointer for quadword
|
||||
# processing
|
||||
- sub %bufp, len # update buffer length
|
||||
+ sub bufp_dw, len # update buffer length
|
||||
.Lalign_loop:
|
||||
- crc32b %bl, crc_init_dw # compute crc32 of 1-byte
|
||||
+ crc32b %bl, crc_init # compute crc32 of 1-byte
|
||||
shr $8, tmp # get next byte
|
||||
dec %bufp
|
||||
jne .Lalign_loop
|
||||
@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl)
|
||||
################################################################
|
||||
|
||||
## compute num of bytes to be processed
|
||||
- movq len, tmp # save num bytes in tmp
|
||||
|
||||
- cmpq $128*24, len
|
||||
+ cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
- mul len_dw
|
||||
+ mul len
|
||||
shrq $16, %rax
|
||||
|
||||
## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl)
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
- crc32q -i*8(block_0), crc_init
|
||||
+ crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
crc32q -i*8(block_2), crc2
|
||||
i=(i-1)
|
||||
@@ -186,7 +182,7 @@ LABEL crc_ %i
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
- crc32q -i*8(block_0), crc_init
|
||||
+ crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
|
||||
@@ -200,9 +196,9 @@ LABEL crc_ %i
|
||||
shlq $3, %rax # rax *= 8
|
||||
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
- subq %rax, tmp # tmp -= rax*24
|
||||
+ sub %eax, len # len -= rax*24
|
||||
|
||||
- movq crc_init, %xmm1 # CRC for block 1
|
||||
+ movq crc_init_q, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
|
||||
movq crc1, %xmm2 # CRC for block 2
|
||||
@@ -211,8 +207,8 @@ LABEL crc_ %i
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
xor -i*8(block_2), %rax
|
||||
- mov crc2, crc_init
|
||||
- crc32 %rax, crc_init
|
||||
+ mov crc2, crc_init_q
|
||||
+ crc32 %rax, crc_init_q
|
||||
|
||||
################################################################
|
||||
## 5) Check for end:
|
||||
@@ -220,10 +216,9 @@ LABEL crc_ %i
|
||||
|
||||
LABEL crc_ 0
|
||||
ENDBR
|
||||
- mov tmp, len
|
||||
- cmp $128*24, tmp
|
||||
+ cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
- cmp $SMALL_SIZE, tmp
|
||||
+ cmp $SMALL_SIZE, len
|
||||
jae .Lcontinue_block
|
||||
|
||||
#######################################################################
|
||||
@@ -232,30 +227,30 @@ LABEL crc_ 0
|
||||
.Lsmall:
|
||||
test len, len
|
||||
jz .Ldone
|
||||
- mov len_dw, %eax
|
||||
+ mov len, %eax
|
||||
shr $3, %eax
|
||||
jz .Ldo_dword
|
||||
.Ldo_qwords:
|
||||
- crc32q (bufptmp), crc_init
|
||||
+ crc32q (bufptmp), crc_init_q
|
||||
add $8, bufptmp
|
||||
dec %eax
|
||||
jnz .Ldo_qwords
|
||||
.Ldo_dword:
|
||||
- test $4, len_dw
|
||||
+ test $4, len
|
||||
jz .Ldo_word
|
||||
- crc32l (bufptmp), crc_init_dw
|
||||
+ crc32l (bufptmp), crc_init
|
||||
add $4, bufptmp
|
||||
.Ldo_word:
|
||||
- test $2, len_dw
|
||||
+ test $2, len
|
||||
jz .Ldo_byte
|
||||
- crc32w (bufptmp), crc_init_dw
|
||||
+ crc32w (bufptmp), crc_init
|
||||
add $2, bufptmp
|
||||
.Ldo_byte:
|
||||
- test $1, len_dw
|
||||
+ test $1, len
|
||||
jz .Ldone
|
||||
- crc32b (bufptmp), crc_init_dw
|
||||
+ crc32b (bufptmp), crc_init
|
||||
.Ldone:
|
||||
- movq crc_init, %rax
|
||||
+ mov crc_init, %eax
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbx
|
@@ -1,374 +0,0 @@
|
||||
From 5ffad9b234995f73548763a8487ecd256bba8d8d Mon Sep 17 00:00:00 2001
|
||||
From: Eric Biggers <ebiggers@google.com>
|
||||
Date: Sun, 13 Oct 2024 21:06:49 -0700
|
||||
Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling
|
||||
|
||||
crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
|
||||
unrolled and uses a jump table to jump into the correct location. This
|
||||
optimization is misguided, as it bloats the binary code size and
|
||||
introduces an indirect call. x86_64 CPUs can predict loops well, so it
|
||||
is fine to just use a loop instead. Loop bookkeeping instructions can
|
||||
compete with the crc instructions for the ALUs, but this is easily
|
||||
mitigated by unrolling the loop by a smaller amount, such as 4 times.
|
||||
|
||||
Therefore, re-roll the loop and make related tweaks to the code.
|
||||
|
||||
This reduces the binary code size of crc_pcl() from 4546 bytes to 418
|
||||
bytes, a 91% reduction. In general it also makes the code faster, with
|
||||
some large improvements seen when retpoline is enabled.
|
||||
|
||||
More detailed performance results are shown below. They are given as
|
||||
percent improvement in throughput (negative means regressed) for CPU
|
||||
microarchitecture vs. input length in bytes. E.g. an improvement from
|
||||
40 GB/s to 50 GB/s would be listed as 25%.
|
||||
|
||||
Table 1: Results with retpoline enabled (the default):
|
||||
|
||||
| 512 | 833 | 1024 | 2000 | 3173 | 4096 |
|
||||
---------------------+-------+-------+-------+-------+-------+-------+
|
||||
Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% |
|
||||
Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% |
|
||||
AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% |
|
||||
|
||||
Table 2: Results with retpoline disabled:
|
||||
|
||||
| 512 | 833 | 1024 | 2000 | 3173 | 4096 |
|
||||
---------------------+-------+-------+-------+-------+-------+-------+
|
||||
Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% |
|
||||
Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% |
|
||||
AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% |
|
||||
|
||||
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||||
---
|
||||
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
|
||||
1 file changed, 92 insertions(+), 141 deletions(-)
|
||||
|
||||
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
@@ -7,6 +7,7 @@
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation.
|
||||
+ * Copyright 2024 Google LLC
|
||||
*
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
@@ -44,18 +45,9 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
-#include <asm/nospec-branch.h>
|
||||
|
||||
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||||
|
||||
-.macro LABEL prefix n
|
||||
-.L\prefix\n\():
|
||||
-.endm
|
||||
-
|
||||
-.macro JMPTBL_ENTRY i
|
||||
-.quad .Lcrc_\i
|
||||
-.endm
|
||||
-
|
||||
# Define threshold below which buffers are considered "small" and routed to
|
||||
# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
@@ -64,139 +56,116 @@
|
||||
|
||||
.text
|
||||
SYM_FUNC_START(crc_pcl)
|
||||
-#define bufp rdi
|
||||
-#define bufp_dw %edi
|
||||
-#define bufp_w %di
|
||||
-#define bufp_b %dil
|
||||
-#define bufptmp %rcx
|
||||
-#define block_0 %rcx
|
||||
-#define block_1 %rdx
|
||||
-#define block_2 %r11
|
||||
-#define len %esi
|
||||
-#define crc_init_arg %edx
|
||||
-#define tmp %rbx
|
||||
-#define crc_init %r8d
|
||||
-#define crc_init_q %r8
|
||||
-#define crc1 %r9
|
||||
-#define crc2 %r10
|
||||
-
|
||||
- pushq %rbx
|
||||
- pushq %rdi
|
||||
- pushq %rsi
|
||||
-
|
||||
- ## Move crc_init for Linux to a different
|
||||
- mov crc_init_arg, crc_init
|
||||
+#define bufp %rdi
|
||||
+#define bufp_d %edi
|
||||
+#define len %esi
|
||||
+#define crc_init %edx
|
||||
+#define crc_init_q %rdx
|
||||
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
|
||||
+#define n_misaligned_q %rcx
|
||||
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
|
||||
+#define chunk_bytes_q %rcx
|
||||
+#define crc1 %r8
|
||||
+#define crc2 %r9
|
||||
|
||||
- mov %bufp, bufptmp # rdi = *buf
|
||||
cmp $SMALL_SIZE, len
|
||||
jb .Lsmall
|
||||
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
- neg %bufp
|
||||
- and $7, %bufp # calculate the unalignment amount of
|
||||
+ mov bufp_d, n_misaligned
|
||||
+ neg n_misaligned
|
||||
+ and $7, n_misaligned # calculate the misalignment amount of
|
||||
# the address
|
||||
- je .Lproc_block # Skip if aligned
|
||||
+ je .Laligned # Skip if aligned
|
||||
|
||||
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
|
||||
+ # the remaining data to an 8-byte boundary.
|
||||
.Ldo_align:
|
||||
- #### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
- movq (bufptmp), tmp # load a quadward from the buffer
|
||||
- add %bufp, bufptmp # align buffer pointer for quadword
|
||||
- # processing
|
||||
- sub bufp_dw, len # update buffer length
|
||||
+ movq (bufp), %rax
|
||||
+ add n_misaligned_q, bufp
|
||||
+ sub n_misaligned, len
|
||||
.Lalign_loop:
|
||||
- crc32b %bl, crc_init # compute crc32 of 1-byte
|
||||
- shr $8, tmp # get next byte
|
||||
- dec %bufp
|
||||
+ crc32b %al, crc_init # compute crc32 of 1-byte
|
||||
+ shr $8, %rax # get next byte
|
||||
+ dec n_misaligned
|
||||
jne .Lalign_loop
|
||||
-
|
||||
-.Lproc_block:
|
||||
+.Laligned:
|
||||
|
||||
################################################################
|
||||
- ## 2) PROCESS BLOCKS:
|
||||
+ ## 2) PROCESS BLOCK:
|
||||
################################################################
|
||||
|
||||
- ## compute num of bytes to be processed
|
||||
-
|
||||
cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
-.Lcontinue_block:
|
||||
- ## len < 128*24
|
||||
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
- mul len
|
||||
- shrq $16, %rax
|
||||
-
|
||||
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
-
|
||||
- ## process rax 24-byte chunks (128 >= rax >= 0)
|
||||
-
|
||||
- ## compute end address of each block
|
||||
- ## block 0 (base addr + RAX * 8)
|
||||
- ## block 1 (base addr + RAX * 16)
|
||||
- ## block 2 (base addr + RAX * 24)
|
||||
- lea (bufptmp, %rax, 8), block_0
|
||||
- lea (block_0, %rax, 8), block_1
|
||||
- lea (block_1, %rax, 8), block_2
|
||||
-
|
||||
- xor crc1, crc1
|
||||
- xor crc2, crc2
|
||||
-
|
||||
- ## branch into array
|
||||
- leaq jump_table(%rip), %bufp
|
||||
- mov (%bufp,%rax,8), %bufp
|
||||
- JMP_NOSPEC bufp
|
||||
+.Lpartial_block:
|
||||
+ # Compute floor(len / 24) to get num qwords to process from each lane.
|
||||
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
|
||||
+ shr $16, %eax
|
||||
+ jmp .Lcrc_3lanes
|
||||
|
||||
- ################################################################
|
||||
- ## 2a) PROCESS FULL BLOCKS:
|
||||
- ################################################################
|
||||
.Lfull_block:
|
||||
- movl $128,%eax
|
||||
- lea 128*8*2(block_0), block_1
|
||||
- lea 128*8*3(block_0), block_2
|
||||
- add $128*8*1, block_0
|
||||
-
|
||||
- xor crc1,crc1
|
||||
- xor crc2,crc2
|
||||
-
|
||||
- # Fall through into top of crc array (crc_128)
|
||||
+ # Processing 128 qwords from each lane.
|
||||
+ mov $128, %eax
|
||||
|
||||
################################################################
|
||||
- ## 3) CRC Array:
|
||||
+ ## 3) CRC each of three lanes:
|
||||
################################################################
|
||||
|
||||
- i=128
|
||||
-.rept 128-1
|
||||
-.altmacro
|
||||
-LABEL crc_ %i
|
||||
-.noaltmacro
|
||||
- ENDBR
|
||||
- crc32q -i*8(block_0), crc_init_q
|
||||
- crc32q -i*8(block_1), crc1
|
||||
- crc32q -i*8(block_2), crc2
|
||||
- i=(i-1)
|
||||
-.endr
|
||||
-
|
||||
-.altmacro
|
||||
-LABEL crc_ %i
|
||||
-.noaltmacro
|
||||
- ENDBR
|
||||
- crc32q -i*8(block_0), crc_init_q
|
||||
- crc32q -i*8(block_1), crc1
|
||||
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
+.Lcrc_3lanes:
|
||||
+ xor crc1,crc1
|
||||
+ xor crc2,crc2
|
||||
+ mov %eax, chunk_bytes
|
||||
+ shl $3, chunk_bytes # num bytes to process from each lane
|
||||
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
|
||||
+ jl .Lcrc_3lanes_4x_done
|
||||
+
|
||||
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
|
||||
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
|
||||
+.Lcrc_3lanes_4x_loop:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 8(bufp), crc_init_q
|
||||
+ crc32q 8(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 16(bufp), crc_init_q
|
||||
+ crc32q 16(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 24(bufp), crc_init_q
|
||||
+ crc32q 24(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
|
||||
+ add $32, bufp
|
||||
+ sub $4, %eax
|
||||
+ jge .Lcrc_3lanes_4x_loop
|
||||
+
|
||||
+.Lcrc_3lanes_4x_done:
|
||||
+ add $4, %eax
|
||||
+ jz .Lcrc_3lanes_last_qword
|
||||
+
|
||||
+.Lcrc_3lanes_1x_loop:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
+ add $8, bufp
|
||||
+ dec %eax
|
||||
+ jnz .Lcrc_3lanes_1x_loop
|
||||
|
||||
- mov block_2, block_0
|
||||
+.Lcrc_3lanes_last_qword:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
|
||||
|
||||
################################################################
|
||||
## 4) Combine three results:
|
||||
################################################################
|
||||
|
||||
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
|
||||
- shlq $3, %rax # rax *= 8
|
||||
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
- sub %eax, len # len -= rax*24
|
||||
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
|
||||
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
|
||||
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
|
||||
+ sub %eax, len # len -= chunk_bytes * 3
|
||||
|
||||
movq crc_init_q, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
@@ -206,20 +175,19 @@ LABEL crc_ %i
|
||||
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
- xor -i*8(block_2), %rax
|
||||
+ xor (bufp,chunk_bytes_q,2), %rax
|
||||
mov crc2, crc_init_q
|
||||
crc32 %rax, crc_init_q
|
||||
+ lea 8(bufp,chunk_bytes_q,2), bufp
|
||||
|
||||
################################################################
|
||||
- ## 5) Check for end:
|
||||
+ ## 5) If more blocks remain, goto (2):
|
||||
################################################################
|
||||
|
||||
-LABEL crc_ 0
|
||||
- ENDBR
|
||||
cmp $128*24, len
|
||||
- jae .Lfull_block
|
||||
+ jae .Lfull_block
|
||||
cmp $SMALL_SIZE, len
|
||||
- jae .Lcontinue_block
|
||||
+ jae .Lpartial_block
|
||||
|
||||
#######################################################################
|
||||
## 6) Process any remainder without interleaving:
|
||||
@@ -231,47 +199,30 @@ LABEL crc_ 0
|
||||
shr $3, %eax
|
||||
jz .Ldo_dword
|
||||
.Ldo_qwords:
|
||||
- crc32q (bufptmp), crc_init_q
|
||||
- add $8, bufptmp
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ add $8, bufp
|
||||
dec %eax
|
||||
jnz .Ldo_qwords
|
||||
.Ldo_dword:
|
||||
test $4, len
|
||||
jz .Ldo_word
|
||||
- crc32l (bufptmp), crc_init
|
||||
- add $4, bufptmp
|
||||
+ crc32l (bufp), crc_init
|
||||
+ add $4, bufp
|
||||
.Ldo_word:
|
||||
test $2, len
|
||||
jz .Ldo_byte
|
||||
- crc32w (bufptmp), crc_init
|
||||
- add $2, bufptmp
|
||||
+ crc32w (bufp), crc_init
|
||||
+ add $2, bufp
|
||||
.Ldo_byte:
|
||||
test $1, len
|
||||
jz .Ldone
|
||||
- crc32b (bufptmp), crc_init
|
||||
+ crc32b (bufp), crc_init
|
||||
.Ldone:
|
||||
mov crc_init, %eax
|
||||
- popq %rsi
|
||||
- popq %rdi
|
||||
- popq %rbx
|
||||
RET
|
||||
SYM_FUNC_END(crc_pcl)
|
||||
|
||||
.section .rodata, "a", @progbits
|
||||
- ################################################################
|
||||
- ## jump table Table is 129 entries x 2 bytes each
|
||||
- ################################################################
|
||||
-.align 4
|
||||
-jump_table:
|
||||
- i=0
|
||||
-.rept 129
|
||||
-.altmacro
|
||||
-JMPTBL_ENTRY %i
|
||||
-.noaltmacro
|
||||
- i=i+1
|
||||
-.endr
|
||||
-
|
||||
-
|
||||
################################################################
|
||||
## PCLMULQDQ tables
|
||||
## Table is 128 entries x 2 words (8 bytes) each
|
Reference in New Issue
Block a user