2024-12-10 06:44:25 +03:00
|
|
|
From 0a957679a29a06fb2e3971615ff9f05f6becb941 Mon Sep 17 00:00:00 2001
|
2024-10-29 05:12:06 +03:00
|
|
|
From: Eric Biggers <ebiggers@google.com>
|
|
|
|
Date: Sun, 13 Oct 2024 21:06:49 -0700
|
|
|
|
Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
|
|
|
|
|
|
|
|
The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
|
|
|
|
lengths >= 512, due to the overhead of saving and restoring FPU state.
|
|
|
|
Therefore, it is unnecessary for this code to be excessively "optimized"
|
|
|
|
for lengths < 200. Eliminate the excessive unrolling of this part of
|
|
|
|
the code and use a more straightforward qword-at-a-time loop.
|
|
|
|
|
|
|
|
Note: the part of the code in question is not entirely redundant, as it
|
|
|
|
is still used to process any remainder mod 24, as well as any remaining
|
|
|
|
data when fewer than 200 bytes remain after at least one 3072-byte chunk.
|
|
|
|
|
|
|
|
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
|
|
|
---
|
|
|
|
arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
|
|
|
|
1 file changed, 33 insertions(+), 83 deletions(-)
|
|
|
|
|
|
|
|
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
|
|
|
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
|
|
|
@@ -56,20 +56,10 @@
|
|
|
|
.quad .Lcrc_\i
|
|
|
|
.endm
|
|
|
|
|
|
|
|
-.macro JNC_LESS_THAN j
|
|
|
|
- jnc .Lless_than_\j
|
|
|
|
-.endm
|
|
|
|
-
|
|
|
|
-# Define threshold where buffers are considered "small" and routed to more
|
|
|
|
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
|
|
|
|
-# SMALL_SIZE can be no larger than 255.
|
|
|
|
-
|
|
|
|
+# Define threshold below which buffers are considered "small" and routed to
|
|
|
|
+# regular CRC code that does not interleave the CRC instructions.
|
|
|
|
#define SMALL_SIZE 200
|
|
|
|
|
|
|
|
-.if (SMALL_SIZE > 255)
|
|
|
|
-.error "SMALL_ SIZE must be < 256"
|
|
|
|
-.endif
|
|
|
|
-
|
|
|
|
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
|
|
|
|
|
|
|
.text
|
|
|
|
@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl)
|
|
|
|
## Move crc_init for Linux to a different
|
|
|
|
mov crc_init_arg, crc_init
|
|
|
|
|
|
|
|
+ mov %bufp, bufptmp # rdi = *buf
|
|
|
|
+ cmp $SMALL_SIZE, len
|
|
|
|
+ jb .Lsmall
|
|
|
|
+
|
|
|
|
################################################################
|
|
|
|
## 1) ALIGN:
|
|
|
|
################################################################
|
|
|
|
-
|
|
|
|
- mov %bufp, bufptmp # rdi = *buf
|
|
|
|
neg %bufp
|
|
|
|
and $7, %bufp # calculate the unalignment amount of
|
|
|
|
# the address
|
|
|
|
je .Lproc_block # Skip if aligned
|
|
|
|
|
|
|
|
- ## If len is less than 8 and we're unaligned, we need to jump
|
|
|
|
- ## to special code to avoid reading beyond the end of the buffer
|
|
|
|
- cmp $8, len
|
|
|
|
- jae .Ldo_align
|
|
|
|
- # less_than_8 expects length in upper 3 bits of len_dw
|
|
|
|
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
|
|
|
- shl $32-3+1, len_dw
|
|
|
|
- jmp .Lless_than_8_post_shl1
|
|
|
|
-
|
|
|
|
.Ldo_align:
|
|
|
|
#### Calculate CRC of unaligned bytes of the buffer (if any)
|
|
|
|
movq (bufptmp), tmp # load a quadward from the buffer
|
|
|
|
@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl)
|
|
|
|
jae .Lfull_block
|
|
|
|
|
|
|
|
.Lcontinue_block:
|
|
|
|
- cmpq $SMALL_SIZE, len
|
|
|
|
- jb .Lsmall
|
|
|
|
-
|
|
|
|
## len < 128*24
|
|
|
|
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
|
|
|
mul len_dw
|
|
|
|
@@ -243,68 +223,38 @@ LABEL crc_ 0
|
|
|
|
mov tmp, len
|
|
|
|
cmp $128*24, tmp
|
|
|
|
jae .Lfull_block
|
|
|
|
- cmp $24, tmp
|
|
|
|
+ cmp $SMALL_SIZE, tmp
|
|
|
|
jae .Lcontinue_block
|
|
|
|
|
|
|
|
-.Lless_than_24:
|
|
|
|
- shl $32-4, len_dw # less_than_16 expects length
|
|
|
|
- # in upper 4 bits of len_dw
|
|
|
|
- jnc .Lless_than_16
|
|
|
|
- crc32q (bufptmp), crc_init
|
|
|
|
- crc32q 8(bufptmp), crc_init
|
|
|
|
- jz .Ldo_return
|
|
|
|
- add $16, bufptmp
|
|
|
|
- # len is less than 8 if we got here
|
|
|
|
- # less_than_8 expects length in upper 3 bits of len_dw
|
|
|
|
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
|
|
|
- shl $2, len_dw
|
|
|
|
- jmp .Lless_than_8_post_shl1
|
|
|
|
-
|
|
|
|
#######################################################################
|
|
|
|
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
|
|
|
|
+ ## 6) Process any remainder without interleaving:
|
|
|
|
#######################################################################
|
|
|
|
.Lsmall:
|
|
|
|
- shl $32-8, len_dw # Prepare len_dw for less_than_256
|
|
|
|
- j=256
|
|
|
|
-.rept 5 # j = {256, 128, 64, 32, 16}
|
|
|
|
-.altmacro
|
|
|
|
-LABEL less_than_ %j # less_than_j: Length should be in
|
|
|
|
- # upper lg(j) bits of len_dw
|
|
|
|
- j=(j/2)
|
|
|
|
- shl $1, len_dw # Get next MSB
|
|
|
|
- JNC_LESS_THAN %j
|
|
|
|
-.noaltmacro
|
|
|
|
- i=0
|
|
|
|
-.rept (j/8)
|
|
|
|
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
|
|
|
|
- i=i+8
|
|
|
|
-.endr
|
|
|
|
- jz .Ldo_return # Return if remaining length is zero
|
|
|
|
- add $j, bufptmp # Advance buf
|
|
|
|
-.endr
|
|
|
|
-
|
|
|
|
-.Lless_than_8: # Length should be stored in
|
|
|
|
- # upper 3 bits of len_dw
|
|
|
|
- shl $1, len_dw
|
|
|
|
-.Lless_than_8_post_shl1:
|
|
|
|
- jnc .Lless_than_4
|
|
|
|
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
|
|
|
|
- jz .Ldo_return # return if remaining data is zero
|
|
|
|
- add $4, bufptmp
|
|
|
|
-.Lless_than_4: # Length should be stored in
|
|
|
|
- # upper 2 bits of len_dw
|
|
|
|
- shl $1, len_dw
|
|
|
|
- jnc .Lless_than_2
|
|
|
|
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
|
|
|
|
- jz .Ldo_return # return if remaining data is zero
|
|
|
|
- add $2, bufptmp
|
|
|
|
-.Lless_than_2: # Length should be stored in the MSB
|
|
|
|
- # of len_dw
|
|
|
|
- shl $1, len_dw
|
|
|
|
- jnc .Lless_than_1
|
|
|
|
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
|
|
|
|
-.Lless_than_1: # Length should be zero
|
|
|
|
-.Ldo_return:
|
|
|
|
+ test len, len
|
|
|
|
+ jz .Ldone
|
|
|
|
+ mov len_dw, %eax
|
|
|
|
+ shr $3, %eax
|
|
|
|
+ jz .Ldo_dword
|
|
|
|
+.Ldo_qwords:
|
|
|
|
+ crc32q (bufptmp), crc_init
|
|
|
|
+ add $8, bufptmp
|
|
|
|
+ dec %eax
|
|
|
|
+ jnz .Ldo_qwords
|
|
|
|
+.Ldo_dword:
|
|
|
|
+ test $4, len_dw
|
|
|
|
+ jz .Ldo_word
|
|
|
|
+ crc32l (bufptmp), crc_init_dw
|
|
|
|
+ add $4, bufptmp
|
|
|
|
+.Ldo_word:
|
|
|
|
+ test $2, len_dw
|
|
|
|
+ jz .Ldo_byte
|
|
|
|
+ crc32w (bufptmp), crc_init_dw
|
|
|
|
+ add $2, bufptmp
|
|
|
|
+.Ldo_byte:
|
|
|
|
+ test $1, len_dw
|
|
|
|
+ jz .Ldone
|
|
|
|
+ crc32b (bufptmp), crc_init_dw
|
|
|
|
+.Ldone:
|
|
|
|
movq crc_init, %rax
|
|
|
|
popq %rsi
|
|
|
|
popq %rdi
|