refresh patches

2025-03-27 01:51:30 +03:00
parent 3d597650a9
commit b65c570ac2
239 changed files with 14214 additions and 9267 deletions
--- a/debian/patches/patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch
+++ b/debian/patches/patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch
@@ -0,0 +1,65 @@
+From 594316efc465f1408482e0d1dd379f4e3a6a5c7c Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Mon, 27 Jan 2025 13:16:09 -0800
+Subject: crypto: x86/aes-xts - make the fast path 64-bit specific
+
+Remove 32-bit support from the fast path in xts_crypt().  Then optimize
+it for 64-bit, and simplify the code, by switching to sg_virt() and
+removing the now-unnecessary checks for crossing a page boundary.
+
+The result is simpler code that is slightly smaller and faster in the
+case that actually matters (64-bit).
+
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+---
+ arch/x86/crypto/aesni-intel_glue.c | 30 ++++++++++--------------------
+ 1 file changed, 10 insertions(+), 20 deletions(-)
+
+--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -581,11 +581,8 @@ xts_crypt(struct skcipher_request *req,
+ {
+ 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ 	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+-	const unsigned int cryptlen = req->cryptlen;
+-	struct scatterlist *src = req->src;
+-	struct scatterlist *dst = req->dst;
+ 
+-	if (unlikely(cryptlen < AES_BLOCK_SIZE))
+	if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
+ 		return -EINVAL;
+ 
+ 	kernel_fpu_begin();
+@@ -593,23 +590,16 @@ xts_crypt(struct skcipher_request *req,
+ 
+ 	/*
+ 	 * In practice, virtually all XTS plaintexts and ciphertexts are either
+-	 * 512 or 4096 bytes, aligned such that they don't span page boundaries.
+-	 * To optimize the performance of these cases, and also any other case
+-	 * where no page boundary is spanned, the below fast-path handles
+-	 * single-page sources and destinations as efficiently as possible.
+	 * 512 or 4096 bytes and do not use multiple scatterlist elements.  To
+	 * optimize the performance of these cases, the below fast-path handles
+	 * single-scatterlist-element messages as efficiently as possible.  The
+	 * code is 64-bit specific, as it assumes no page mapping is needed.
+ 	 */
+-	if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
+-		   src->offset + cryptlen <= PAGE_SIZE &&
+-		   dst->offset + cryptlen <= PAGE_SIZE)) {
+-		struct page *src_page = sg_page(src);
+-		struct page *dst_page = sg_page(dst);
+-		void *src_virt = kmap_local_page(src_page) + src->offset;
+-		void *dst_virt = kmap_local_page(dst_page) + dst->offset;
+-
+-		(*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
+-			      req->iv);
+-		kunmap_local(dst_virt);
+-		kunmap_local(src_virt);
+	if (IS_ENABLED(CONFIG_X86_64) &&
+	    likely(req->src->length >= req->cryptlen &&
+		   req->dst->length >= req->cryptlen)) {
+		(*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
+			      sg_virt(req->dst), req->cryptlen, req->iv);
+ 		kernel_fpu_end();
+ 		return 0;
+ 	}
--- a/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch
+++ b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch
@@ -1,181 +0,0 @@
-From 0a957679a29a06fb2e3971615ff9f05f6becb941 Mon Sep 17 00:00:00 2001
-From: Eric Biggers <ebiggers@google.com>
-Date: Sun, 13 Oct 2024 21:06:49 -0700
-Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes
-
-The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
-lengths >= 512, due to the overhead of saving and restoring FPU state.
-Therefore, it is unnecessary for this code to be excessively "optimized"
-for lengths < 200.  Eliminate the excessive unrolling of this part of
-the code and use a more straightforward qword-at-a-time loop.
-
-Note: the part of the code in question is not entirely redundant, as it
-is still used to process any remainder mod 24, as well as any remaining
-data when fewer than 200 bytes remain after least one 3072-byte chunk.
-
-Signed-off-by: Eric Biggers <ebiggers@google.com>
---
- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
- 1 file changed, 33 insertions(+), 83 deletions(-)
-
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-@@ -56,20 +56,10 @@
- .quad .Lcrc_\i
- .endm
- 
-.macro JNC_LESS_THAN j
-	jnc .Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
-+# Define threshold below which buffers are considered "small" and routed to
-+# regular CRC code that does not interleave the CRC instructions.
- #define SMALL_SIZE 200
- 
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
- # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
- 
- .text
-@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl)
- 	## Move crc_init for Linux to a different
- 	mov     crc_init_arg, crc_init
- 
-+	mov	%bufp, bufptmp		# rdi = *buf
-+	cmp	$SMALL_SIZE, len
-+	jb	.Lsmall
-+
- 	################################################################
- 	## 1) ALIGN:
- 	################################################################
-
-	mov     %bufp, bufptmp		# rdi = *buf
- 	neg     %bufp
- 	and     $7, %bufp		# calculate the unalignment amount of
- 					# the address
- 	je      .Lproc_block		# Skip if aligned
- 
-	## If len is less than 8 and we're unaligned, we need to jump
-	## to special code to avoid reading beyond the end of the buffer
-	cmp     $8, len
-	jae     .Ldo_align
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl     $32-3+1, len_dw
-	jmp     .Lless_than_8_post_shl1
-
- .Ldo_align:
- 	#### Calculate CRC of unaligned bytes of the buffer (if any)
- 	movq    (bufptmp), tmp		# load a quadward from the buffer
-@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl)
- 	jae     .Lfull_block
- 
- .Lcontinue_block:
-	cmpq    $SMALL_SIZE, len
-	jb      .Lsmall
-
- 	## len < 128*24
- 	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
- 	mul     len_dw
-@@ -243,68 +223,38 @@ LABEL crc_ 0
- 	mov     tmp, len
- 	cmp     $128*24, tmp
- 	jae     .Lfull_block
-	cmp     $24, tmp
-+	cmp	$SMALL_SIZE, tmp
- 	jae     .Lcontinue_block
- 
-.Lless_than_24:
-	shl     $32-4, len_dw			# less_than_16 expects length
-						# in upper 4 bits of len_dw
-	jnc     .Lless_than_16
-	crc32q  (bufptmp), crc_init
-	crc32q  8(bufptmp), crc_init
-	jz      .Ldo_return
-	add     $16, bufptmp
-	# len is less than 8 if we got here
-	# less_than_8 expects length in upper 3 bits of len_dw
-	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
-	shl     $2, len_dw
-	jmp     .Lless_than_8_post_shl1
-
- 	#######################################################################
-	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
-+	## 6) Process any remainder without interleaving:
- 	#######################################################################
- .Lsmall:
-	shl $32-8, len_dw		# Prepare len_dw for less_than_256
-	j=256
-.rept 5					# j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j			# less_than_j: Length should be in
-					# upper lg(j) bits of len_dw
-	j=(j/2)
-	shl     $1, len_dw		# Get next MSB
-	JNC_LESS_THAN %j
-.noaltmacro
-	i=0
-.rept (j/8)
-	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
-	i=i+8
-.endr
-	jz      .Ldo_return		# Return if remaining length is zero
-	add     $j, bufptmp		# Advance buf
-.endr
-
-.Lless_than_8:				# Length should be stored in
-					# upper 3 bits of len_dw
-	shl     $1, len_dw
-.Lless_than_8_post_shl1:
-	jnc     .Lless_than_4
-	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
-	jz      .Ldo_return		# return if remaining data is zero
-	add     $4, bufptmp
-.Lless_than_4:				# Length should be stored in
-					# upper 2 bits of len_dw
-	shl     $1, len_dw
-	jnc     .Lless_than_2
-	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
-	jz      .Ldo_return		# return if remaining data is zero
-	add     $2, bufptmp
-.Lless_than_2:				# Length should be stored in the MSB
-					# of len_dw
-	shl     $1, len_dw
-	jnc     .Lless_than_1
-	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
-.Lless_than_1:				# Length should be zero
-.Ldo_return:
-+	test	len, len
-+	jz	.Ldone
-+	mov	len_dw, %eax
-+	shr	$3, %eax
-+	jz	.Ldo_dword
-+.Ldo_qwords:
-+	crc32q	(bufptmp), crc_init
-+	add	$8, bufptmp
-+	dec	%eax
-+	jnz	.Ldo_qwords
-+.Ldo_dword:
-+	test	$4, len_dw
-+	jz	.Ldo_word
-+	crc32l	(bufptmp), crc_init_dw
-+	add	$4, bufptmp
-+.Ldo_word:
-+	test	$2, len_dw
-+	jz	.Ldo_byte
-+	crc32w	(bufptmp), crc_init_dw
-+	add	$2, bufptmp
-+.Ldo_byte:
-+	test	$1, len_dw
-+	jz	.Ldone
-+	crc32b	(bufptmp), crc_init_dw
-+.Ldone:
- 	movq    crc_init, %rax
- 	popq    %rsi
- 	popq    %rdi
--- a/debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch
+++ b/debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch
--- a/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch
+++ b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch
@@ -1,187 +0,0 @@
-From 3ed4205afe9305d71d055554ba27e7b8923865dc Mon Sep 17 00:00:00 2001
-From: Eric Biggers <ebiggers@google.com>
-Date: Sun, 13 Oct 2024 21:06:49 -0700
-Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit
-
-Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit
-values instead of 64-bit, since the upper bits of the corresponding
-64-bit registers are not guaranteed to be zero.  Also update the type of
-the length argument to be unsigned int rather than int, as the assembly
-code treats it as unsigned.
-
-Note: there haven't been any reports of this bug actually causing
-incorrect behavior.  Neither gcc nor clang guarantee zero-extension to
-64 bits, but zero-extension is likely to happen in practice because most
-instructions that operate on 32-bit registers zero-extend to 64 bits.
-
-Signed-off-by: Eric Biggers <ebiggers@google.com>
---
- arch/x86/crypto/crc32c-intel_glue.c       |  2 +-
- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------
- 2 files changed, 27 insertions(+), 32 deletions(-)
-
--- a/arch/x86/crypto/crc32c-intel_glue.c
-+++ b/arch/x86/crypto/crc32c-intel_glue.c
-@@ -41,7 +41,7 @@
-  */
- #define CRC32C_PCL_BREAKEVEN	512
- 
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
-+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
- 				unsigned int crc_init);
- #endif /* CONFIG_X86_64 */
- 
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-@@ -60,7 +60,7 @@
- # regular CRC code that does not interleave the CRC instructions.
- #define SMALL_SIZE 200
- 
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
-+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
- 
- .text
- SYM_FUNC_START(crc_pcl)
-@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl)
- #define    block_0	%rcx
- #define    block_1	%rdx
- #define    block_2	%r11
-#define    len		%rsi
-#define    len_dw	%esi
-#define    len_w	%si
-#define    len_b	%sil
-#define    crc_init_arg %rdx
-+#define    len		%esi
-+#define    crc_init_arg %edx
- #define    tmp		%rbx
-#define    crc_init	%r8
-#define    crc_init_dw	%r8d
-+#define    crc_init	%r8d
-+#define    crc_init_q	%r8
- #define    crc1		%r9
- #define    crc2		%r10
- 
-@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl)
- 	movq    (bufptmp), tmp		# load a quadward from the buffer
- 	add     %bufp, bufptmp		# align buffer pointer for quadword
- 					# processing
-	sub     %bufp, len		# update buffer length
-+	sub	bufp_dw, len		# update buffer length
- .Lalign_loop:
-	crc32b  %bl, crc_init_dw 	# compute crc32 of 1-byte
-+	crc32b	%bl, crc_init		# compute crc32 of 1-byte
- 	shr     $8, tmp			# get next byte
- 	dec     %bufp
- 	jne     .Lalign_loop
-@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl)
- 	################################################################
- 
- 	## compute num of bytes to be processed
-	movq    len, tmp		# save num bytes in tmp
- 
-	cmpq    $128*24, len
-+	cmp	$128*24, len
- 	jae     .Lfull_block
- 
- .Lcontinue_block:
- 	## len < 128*24
- 	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
-	mul     len_dw
-+	mul	len
- 	shrq    $16, %rax
- 
- 	## eax contains floor(bytes / 24) = num 24-byte chunks to do
-@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl)
- LABEL crc_ %i
- .noaltmacro
- 	ENDBR
-	crc32q   -i*8(block_0), crc_init
-+	crc32q   -i*8(block_0), crc_init_q
- 	crc32q   -i*8(block_1), crc1
- 	crc32q   -i*8(block_2), crc2
- 	i=(i-1)
-@@ -186,7 +182,7 @@ LABEL crc_ %i
- LABEL crc_ %i
- .noaltmacro
- 	ENDBR
-	crc32q   -i*8(block_0), crc_init
-+	crc32q   -i*8(block_0), crc_init_q
- 	crc32q   -i*8(block_1), crc1
- # SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
- 
-@@ -200,9 +196,9 @@ LABEL crc_ %i
- 	shlq    $3, %rax			# rax *= 8
- 	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
- 	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
-	subq    %rax, tmp			# tmp -= rax*24
-+	sub	%eax, len			# len -= rax*24
- 
-	movq    crc_init, %xmm1			# CRC for block 1
-+	movq	crc_init_q, %xmm1		# CRC for block 1
- 	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
- 
- 	movq    crc1, %xmm2			# CRC for block 2
-@@ -211,8 +207,8 @@ LABEL crc_ %i
- 	pxor    %xmm2,%xmm1
- 	movq    %xmm1, %rax
- 	xor     -i*8(block_2), %rax
-	mov     crc2, crc_init
-	crc32   %rax, crc_init
-+	mov	crc2, crc_init_q
-+	crc32	%rax, crc_init_q
- 
- 	################################################################
- 	## 5) Check for end:
-@@ -220,10 +216,9 @@ LABEL crc_ %i
- 
- LABEL crc_ 0
- 	ENDBR
-	mov     tmp, len
-	cmp     $128*24, tmp
-+	cmp	$128*24, len
- 	jae     .Lfull_block
-	cmp	$SMALL_SIZE, tmp
-+	cmp	$SMALL_SIZE, len
- 	jae     .Lcontinue_block
- 
- 	#######################################################################
-@@ -232,30 +227,30 @@ LABEL crc_ 0
- .Lsmall:
- 	test	len, len
- 	jz	.Ldone
-	mov	len_dw, %eax
-+	mov	len, %eax
- 	shr	$3, %eax
- 	jz	.Ldo_dword
- .Ldo_qwords:
-	crc32q	(bufptmp), crc_init
-+	crc32q	(bufptmp), crc_init_q
- 	add	$8, bufptmp
- 	dec	%eax
- 	jnz	.Ldo_qwords
- .Ldo_dword:
-	test	$4, len_dw
-+	test	$4, len
- 	jz	.Ldo_word
-	crc32l	(bufptmp), crc_init_dw
-+	crc32l	(bufptmp), crc_init
- 	add	$4, bufptmp
- .Ldo_word:
-	test	$2, len_dw
-+	test	$2, len
- 	jz	.Ldo_byte
-	crc32w	(bufptmp), crc_init_dw
-+	crc32w	(bufptmp), crc_init
- 	add	$2, bufptmp
- .Ldo_byte:
-	test	$1, len_dw
-+	test	$1, len
- 	jz	.Ldone
-	crc32b	(bufptmp), crc_init_dw
-+	crc32b	(bufptmp), crc_init
- .Ldone:
-	movq    crc_init, %rax
-+	mov	crc_init, %eax
- 	popq    %rsi
- 	popq    %rdi
- 	popq    %rbx
--- a/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch
+++ b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch
@@ -1,374 +0,0 @@
-From 5ffad9b234995f73548763a8487ecd256bba8d8d Mon Sep 17 00:00:00 2001
-From: Eric Biggers <ebiggers@google.com>
-Date: Sun, 13 Oct 2024 21:06:49 -0700
-Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling
-
-crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
-unrolled and uses a jump table to jump into the correct location.  This
-optimization is misguided, as it bloats the binary code size and
-introduces an indirect call.  x86_64 CPUs can predict loops well, so it
-is fine to just use a loop instead.  Loop bookkeeping instructions can
-compete with the crc instructions for the ALUs, but this is easily
-mitigated by unrolling the loop by a smaller amount, such as 4 times.
-
-Therefore, re-roll the loop and make related tweaks to the code.
-
-This reduces the binary code size of crc_pclmul() from 4546 bytes to 418
-bytes, a 91% reduction.  In general it also makes the code faster, with
-some large improvements seen when retpoline is enabled.
-
-More detailed performance results are shown below.  They are given as
-percent improvement in throughput (negative means regressed) for CPU
-microarchitecture vs. input length in bytes.  E.g. an improvement from
-40 GB/s to 50 GB/s would be listed as 25%.
-
-Table 1: Results with retpoline enabled (the default):
-
-                       |   512 |   833 |  1024 |  2000 |  3173 |  4096 |
-  ---------------------+-------+-------+-------+------ +-------+-------+
-  Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
-  Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
-  AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |
-
-Table 2: Results with retpoline disabled:
-
-                       |   512 |   833 |  1024 |  2000 |  3173 |  4096 |
-  ---------------------+-------+-------+-------+------ +-------+-------+
-  Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
-  Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
-  AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |
-
-Signed-off-by: Eric Biggers <ebiggers@google.com>
---
- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
- 1 file changed, 92 insertions(+), 141 deletions(-)
-
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
-@@ -7,6 +7,7 @@
-  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
-  *
-  * Copyright (C) 2012 Intel Corporation.
-+ * Copyright 2024 Google LLC
-  *
-  * Authors:
-  *	Wajdi Feghali <wajdi.k.feghali@intel.com>
-@@ -44,18 +45,9 @@
-  */
- 
- #include <linux/linkage.h>
-#include <asm/nospec-branch.h>
- 
- ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
- 
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
- # Define threshold below which buffers are considered "small" and routed to
- # regular CRC code that does not interleave the CRC instructions.
- #define SMALL_SIZE 200
-@@ -64,139 +56,116 @@
- 
- .text
- SYM_FUNC_START(crc_pcl)
-#define    bufp		rdi
-#define    bufp_dw	%edi
-#define    bufp_w	%di
-#define    bufp_b	%dil
-#define    bufptmp	%rcx
-#define    block_0	%rcx
-#define    block_1	%rdx
-#define    block_2	%r11
-#define    len		%esi
-#define    crc_init_arg %edx
-#define    tmp		%rbx
-#define    crc_init	%r8d
-#define    crc_init_q	%r8
-#define    crc1		%r9
-#define    crc2		%r10
-
-	pushq   %rbx
-	pushq   %rdi
-	pushq   %rsi
-
-	## Move crc_init for Linux to a different
-	mov     crc_init_arg, crc_init
-+#define    bufp		  %rdi
-+#define    bufp_d	  %edi
-+#define    len		  %esi
-+#define    crc_init	  %edx
-+#define    crc_init_q	  %rdx
-+#define    n_misaligned	  %ecx /* overlaps chunk_bytes! */
-+#define    n_misaligned_q %rcx
-+#define    chunk_bytes	  %ecx /* overlaps n_misaligned! */
-+#define    chunk_bytes_q  %rcx
-+#define    crc1		  %r8
-+#define    crc2		  %r9
- 
-	mov	%bufp, bufptmp		# rdi = *buf
- 	cmp	$SMALL_SIZE, len
- 	jb	.Lsmall
- 
- 	################################################################
- 	## 1) ALIGN:
- 	################################################################
-	neg     %bufp
-	and     $7, %bufp		# calculate the unalignment amount of
-+	mov	bufp_d, n_misaligned
-+	neg	n_misaligned
-+	and	$7, n_misaligned	# calculate the misalignment amount of
- 					# the address
-	je      .Lproc_block		# Skip if aligned
-+	je	.Laligned		# Skip if aligned
- 
-+	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
-+	# the remaining data to an 8-byte boundary.
- .Ldo_align:
-	#### Calculate CRC of unaligned bytes of the buffer (if any)
-	movq    (bufptmp), tmp		# load a quadward from the buffer
-	add     %bufp, bufptmp		# align buffer pointer for quadword
-					# processing
-	sub	bufp_dw, len		# update buffer length
-+	movq	(bufp), %rax
-+	add	n_misaligned_q, bufp
-+	sub	n_misaligned, len
- .Lalign_loop:
-	crc32b	%bl, crc_init		# compute crc32 of 1-byte
-	shr     $8, tmp			# get next byte
-	dec     %bufp
-+	crc32b	%al, crc_init		# compute crc32 of 1-byte
-+	shr	$8, %rax		# get next byte
-+	dec	n_misaligned
- 	jne     .Lalign_loop
-
-.Lproc_block:
-+.Laligned:
- 
- 	################################################################
-	## 2) PROCESS  BLOCKS:
-+	## 2) PROCESS BLOCK:
- 	################################################################
- 
-	## compute num of bytes to be processed
-
- 	cmp	$128*24, len
- 	jae     .Lfull_block
- 
-.Lcontinue_block:
-	## len < 128*24
-	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
-	mul	len
-	shrq    $16, %rax
-
-	## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
-	## process rax 24-byte chunks (128 >= rax >= 0)
-
-	## compute end address of each block
-	## block 0 (base addr + RAX * 8)
-	## block 1 (base addr + RAX * 16)
-	## block 2 (base addr + RAX * 24)
-	lea     (bufptmp, %rax, 8), block_0
-	lea     (block_0, %rax, 8), block_1
-	lea     (block_1, %rax, 8), block_2
-
-	xor     crc1, crc1
-	xor     crc2, crc2
-
-	## branch into array
-	leaq	jump_table(%rip), %bufp
-	mov	(%bufp,%rax,8), %bufp
-	JMP_NOSPEC bufp
-+.Lpartial_block:
-+	# Compute floor(len / 24) to get num qwords to process from each lane.
-+	imul	$2731, len, %eax	# 2731 = ceil(2^16 / 24)
-+	shr	$16, %eax
-+	jmp	.Lcrc_3lanes
- 
-	################################################################
-	## 2a) PROCESS FULL BLOCKS:
-	################################################################
- .Lfull_block:
-	movl    $128,%eax
-	lea     128*8*2(block_0), block_1
-	lea     128*8*3(block_0), block_2
-	add     $128*8*1, block_0
-
-	xor     crc1,crc1
-	xor     crc2,crc2
-
-	# Fall through into top of crc array (crc_128)
-+	# Processing 128 qwords from each lane.
-+	mov	$128, %eax
- 
- 	################################################################
-	## 3) CRC Array:
-+	## 3) CRC each of three lanes:
- 	################################################################
- 
-	i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q   -i*8(block_0), crc_init_q
-	crc32q   -i*8(block_1), crc1
-	crc32q   -i*8(block_2), crc2
-	i=(i-1)
-.endr
-
-.altmacro
-LABEL crc_ %i
-.noaltmacro
-	ENDBR
-	crc32q   -i*8(block_0), crc_init_q
-	crc32q   -i*8(block_1), crc1
-# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
-+.Lcrc_3lanes:
-+	xor	crc1,crc1
-+	xor     crc2,crc2
-+	mov	%eax, chunk_bytes
-+	shl	$3, chunk_bytes		# num bytes to process from each lane
-+	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
-+	jl	.Lcrc_3lanes_4x_done
-+
-+	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
-+	# bookkeeping instructions, which can compete with crc32q for the ALUs.
-+.Lcrc_3lanes_4x_loop:
-+	crc32q	(bufp), crc_init_q
-+	crc32q	(bufp,chunk_bytes_q), crc1
-+	crc32q	(bufp,chunk_bytes_q,2), crc2
-+	crc32q	8(bufp), crc_init_q
-+	crc32q	8(bufp,chunk_bytes_q), crc1
-+	crc32q	8(bufp,chunk_bytes_q,2), crc2
-+	crc32q	16(bufp), crc_init_q
-+	crc32q	16(bufp,chunk_bytes_q), crc1
-+	crc32q	16(bufp,chunk_bytes_q,2), crc2
-+	crc32q	24(bufp), crc_init_q
-+	crc32q	24(bufp,chunk_bytes_q), crc1
-+	crc32q	24(bufp,chunk_bytes_q,2), crc2
-+	add	$32, bufp
-+	sub	$4, %eax
-+	jge	.Lcrc_3lanes_4x_loop
-+
-+.Lcrc_3lanes_4x_done:
-+	add	$4, %eax
-+	jz	.Lcrc_3lanes_last_qword
-+
-+.Lcrc_3lanes_1x_loop:
-+	crc32q	(bufp), crc_init_q
-+	crc32q	(bufp,chunk_bytes_q), crc1
-+	crc32q	(bufp,chunk_bytes_q,2), crc2
-+	add	$8, bufp
-+	dec	%eax
-+	jnz	.Lcrc_3lanes_1x_loop
- 
-	mov     block_2, block_0
-+.Lcrc_3lanes_last_qword:
-+	crc32q	(bufp), crc_init_q
-+	crc32q	(bufp,chunk_bytes_q), crc1
-+# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet
- 
- 	################################################################
- 	## 4) Combine three results:
- 	################################################################
- 
-	lea	(K_table-8)(%rip), %bufp		# first entry is for idx 1
-	shlq    $3, %rax			# rax *= 8
-	pmovzxdq (%bufp,%rax), %xmm0		# 2 consts: K1:K2
-	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
-	sub	%eax, len			# len -= rax*24
-+	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
-+	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
-+	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
-+	sub	%eax, len			# len -= chunk_bytes * 3
- 
- 	movq	crc_init_q, %xmm1		# CRC for block 1
- 	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
-@@ -206,20 +175,19 @@ LABEL crc_ %i
- 
- 	pxor    %xmm2,%xmm1
- 	movq    %xmm1, %rax
-	xor     -i*8(block_2), %rax
-+	xor	(bufp,chunk_bytes_q,2), %rax
- 	mov	crc2, crc_init_q
- 	crc32	%rax, crc_init_q
-+	lea	8(bufp,chunk_bytes_q,2), bufp
- 
- 	################################################################
-	## 5) Check for end:
-+	## 5) If more blocks remain, goto (2):
- 	################################################################
- 
-LABEL crc_ 0
-	ENDBR
- 	cmp	$128*24, len
-	jae     .Lfull_block
-+	jae	.Lfull_block
- 	cmp	$SMALL_SIZE, len
-	jae     .Lcontinue_block
-+	jae	.Lpartial_block
- 
- 	#######################################################################
- 	## 6) Process any remainder without interleaving:
-@@ -231,47 +199,30 @@ LABEL crc_ 0
- 	shr	$3, %eax
- 	jz	.Ldo_dword
- .Ldo_qwords:
-	crc32q	(bufptmp), crc_init_q
-	add	$8, bufptmp
-+	crc32q	(bufp), crc_init_q
-+	add	$8, bufp
- 	dec	%eax
- 	jnz	.Ldo_qwords
- .Ldo_dword:
- 	test	$4, len
- 	jz	.Ldo_word
-	crc32l	(bufptmp), crc_init
-	add	$4, bufptmp
-+	crc32l	(bufp), crc_init
-+	add	$4, bufp
- .Ldo_word:
- 	test	$2, len
- 	jz	.Ldo_byte
-	crc32w	(bufptmp), crc_init
-	add	$2, bufptmp
-+	crc32w	(bufp), crc_init
-+	add	$2, bufp
- .Ldo_byte:
- 	test	$1, len
- 	jz	.Ldone
-	crc32b	(bufptmp), crc_init
-+	crc32b	(bufp), crc_init
- .Ldone:
- 	mov	crc_init, %eax
-	popq    %rsi
-	popq    %rdi
-	popq    %rbx
-         RET
- SYM_FUNC_END(crc_pcl)
- 
- .section	.rodata, "a", @progbits
-        ################################################################
-        ## jump table        Table is 129 entries x 2 bytes each
-        ################################################################
-.align 4
-jump_table:
-	i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
-	i=i+1
-.endr
-
-
- 	################################################################
- 	## PCLMULQDQ tables
- 	## Table is 128 entries x 2 words (8 bytes) each