diff --git a/debian/bin/genpatch-pfkernel b/debian/bin/genpatch-pfkernel index 76fb33e..7e959bd 100755 --- a/debian/bin/genpatch-pfkernel +++ b/debian/bin/genpatch-pfkernel @@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w" dst='debian/patches/pf-tmp' src='../linux-extras' -branches='btrfs cpuidle crypto fixes kbuild pksm xfs zstd' +branches='amd-pstate cpuidle crypto fixes kbuild zstd' if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi mkdir -p "${dst}" diff --git a/debian/changelog b/debian/changelog index ddacc80..c48510a 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,6 @@ linux (6.14-1) sid; urgency=medium * Sync with Debian. + * Refresh patches. -- Konstantin Demin <rockdrilla@gmail.com> Thu, 27 Mar 2025 01:51:03 +0300 diff --git a/debian/patches/bugfix/all/cpupower-fix-checks-for-cpu-existence.patch b/debian/patches/bugfix/all/cpupower-fix-checks-for-cpu-existence.patch index d8352b8..c9404db 100644 --- a/debian/patches/bugfix/all/cpupower-fix-checks-for-cpu-existence.patch +++ b/debian/patches/bugfix/all/cpupower-fix-checks-for-cpu-existence.patch @@ -24,7 +24,7 @@ negative cases.] --- --- a/tools/power/cpupower/bench/system.c +++ b/tools/power/cpupower/bench/system.c -@@ -58,12 +58,19 @@ long long int get_time() +@@ -45,12 +45,19 @@ long long int get_time() int set_cpufreq_governor(char *governor, unsigned int cpu) { diff --git a/debian/patches/bugfix/all/disable-some-marvell-phys.patch b/debian/patches/bugfix/all/disable-some-marvell-phys.patch index 80a1d58..af7bca8 100644 --- a/debian/patches/bugfix/all/disable-some-marvell-phys.patch +++ b/debian/patches/bugfix/all/disable-some-marvell-phys.patch @@ -16,7 +16,7 @@ correctness. --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c -@@ -1366,6 +1366,7 @@ static int m88e1118_config_init(struct p +@@ -1409,6 +1409,7 @@ static int m88e1118_config_init(struct p return genphy_soft_reset(phydev); } @@ -24,7 +24,7 @@ correctness. static int m88e1149_config_init(struct phy_device *phydev) { int err; -@@ -1391,7 +1392,9 @@ static int m88e1149_config_init(struct p +@@ -1434,7 +1435,9 @@ static int m88e1149_config_init(struct p return genphy_soft_reset(phydev); } @@ -34,7 +34,7 @@ correctness. static int m88e1145_config_init_rgmii(struct phy_device *phydev) { int err; -@@ -1469,6 +1472,7 @@ static int m88e1145_config_init(struct p +@@ -1512,6 +1515,7 @@ static int m88e1145_config_init(struct p return 0; } @@ -42,7 +42,7 @@ correctness. static int m88e1540_get_fld(struct phy_device *phydev, u8 *msecs) { -@@ -3790,6 +3794,7 @@ static struct phy_driver marvell_drivers +@@ -3848,6 +3852,7 @@ static struct phy_driver marvell_drivers .led_hw_control_set = m88e1318_led_hw_control_set, .led_hw_control_get = m88e1318_led_hw_control_get, }, @@ -50,7 +50,7 @@ correctness. { .phy_id = MARVELL_PHY_ID_88E1145, .phy_id_mask = MARVELL_PHY_ID_MASK, -@@ -3813,6 +3818,8 @@ static struct phy_driver marvell_drivers +@@ -3871,6 +3876,8 @@ static struct phy_driver marvell_drivers .cable_test_start = m88e1111_vct_cable_test_start, .cable_test_get_status = m88e1111_vct_cable_test_get_status, }, @@ -59,7 +59,7 @@ correctness. { .phy_id = MARVELL_PHY_ID_88E1149R, .phy_id_mask = MARVELL_PHY_ID_MASK, -@@ -3831,6 +3838,8 @@ static struct phy_driver marvell_drivers +@@ -3889,6 +3896,8 @@ static struct phy_driver marvell_drivers .get_strings = marvell_get_strings, .get_stats = marvell_get_stats, }, @@ -68,7 +68,7 @@ correctness. 
{ .phy_id = MARVELL_PHY_ID_88E1240, .phy_id_mask = MARVELL_PHY_ID_MASK, -@@ -3851,6 +3860,7 @@ static struct phy_driver marvell_drivers +@@ -3909,6 +3918,7 @@ static struct phy_driver marvell_drivers .get_tunable = m88e1011_get_tunable, .set_tunable = m88e1011_set_tunable, }, @@ -76,7 +76,7 @@ correctness. { .phy_id = MARVELL_PHY_ID_88E1116R, .phy_id_mask = MARVELL_PHY_ID_MASK, -@@ -4139,9 +4149,9 @@ static struct mdio_device_id __maybe_unu +@@ -4197,9 +4207,9 @@ static const struct mdio_device_id __may { MARVELL_PHY_ID_88E1111_FINISAR, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1118, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1121R, MARVELL_PHY_ID_MASK }, diff --git a/debian/patches/bugfix/all/drm-amdkfd-Fix-user-queue-validation-on-Gfx7-8.patch b/debian/patches/bugfix/all/drm-amdkfd-Fix-user-queue-validation-on-Gfx7-8.patch deleted file mode 100644 index f1aa810..0000000 --- a/debian/patches/bugfix/all/drm-amdkfd-Fix-user-queue-validation-on-Gfx7-8.patch +++ /dev/null @@ -1,66 +0,0 @@ -From: Philip Yang <Philip.Yang@amd.com> -Date: Wed, 29 Jan 2025 12:37:30 -0500 -Subject: drm/amdkfd: Fix user queue validation on Gfx7/8 -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit -Origin: https://gitlab.freedesktop.org/drm/kernel/-/commit/e7a477735f1771b9a9346a5fbd09d7ff0641723a -Bug-Debian: https://bugs.debian.org/1093124 - -To workaround queue full h/w issue on Gfx7/8, when application create -AQL queue, the ring buffer bo allocate size is queue_size/2 and -map queue_size ring buffer to GPU in 2 pieces using 2 attachments, each -attachment map size is queue_size/2, with same ring_bo backing memory. - -For Gfx7/8, user queue buffer validation should use queue_size/2 to -verify ring_bo allocation and mapping size. - -Fixes: 68e599db7a54 ("drm/amdkfd: Validate user queue buffers") -Suggested-by: Tomáš Trnka <trnka@scm.com> -Signed-off-by: Philip Yang <Philip.Yang@amd.com> -Acked-by: Alex Deucher <alexander.deucher@amd.com> -Signed-off-by: Alex Deucher <alexander.deucher@amd.com> ---- - drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 12 +++++++++++- - 1 file changed, 11 insertions(+), 1 deletion(-) - -diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -index ecccd7adbab4..62c635e9d1aa 100644 ---- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c -@@ -233,6 +233,7 @@ void kfd_queue_buffer_put(struct amdgpu_bo **bo) - int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) - { - struct kfd_topology_device *topo_dev; -+ u64 expected_queue_size; - struct amdgpu_vm *vm; - u32 total_cwsr_size; - int err; -@@ -241,6 +242,15 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope - if (!topo_dev) - return -EINVAL; - -+ /* AQL queues on GFX7 and GFX8 appear twice their actual size */ -+ if (properties->type == KFD_QUEUE_TYPE_COMPUTE && -+ properties->format == KFD_QUEUE_FORMAT_AQL && -+ topo_dev->node_props.gfx_target_version >= 70000 && -+ topo_dev->node_props.gfx_target_version < 90000) -+ expected_queue_size = properties->queue_size / 2; -+ else -+ expected_queue_size = properties->queue_size; -+ - vm = drm_priv_to_vm(pdd->drm_priv); - err = amdgpu_bo_reserve(vm->root.bo, false); - if (err) -@@ -255,7 +265,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope - goto out_err_unreserve; - - err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, -- &properties->ring_bo, 
properties->queue_size); -+ &properties->ring_bo, expected_queue_size); - if (err) - goto out_err_unreserve; - --- -2.47.2 - diff --git a/debian/patches/bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch b/debian/patches/bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch index 7336dbb..5c841d6 100644 --- a/debian/patches/bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch +++ b/debian/patches/bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch @@ -18,7 +18,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c -@@ -2619,7 +2619,7 @@ module_exit(exit_btrfs_fs) +@@ -2627,7 +2627,7 @@ module_exit(exit_btrfs_fs) MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); MODULE_LICENSE("GPL"); @@ -27,19 +27,9 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> MODULE_SOFTDEP("pre: xxhash64"); MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: blake2b-256"); ---- a/fs/ext4/super.c -+++ b/fs/ext4/super.c -@@ -7404,6 +7404,6 @@ static void __exit ext4_exit_fs(void) - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Fourth Extended Filesystem"); - MODULE_LICENSE("GPL"); --MODULE_SOFTDEP("pre: crc32c"); -+MODULE_SOFTDEP("pre: crypto-crc32c"); - module_init(ext4_init_fs) - module_exit(ext4_exit_fs) --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c -@@ -3194,6 +3194,7 @@ static void __exit journal_exit(void) +@@ -3152,6 +3152,7 @@ static void __exit journal_exit(void) MODULE_DESCRIPTION("Generic filesystem journal-writing module"); MODULE_LICENSE("GPL"); @@ -49,7 +39,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c -@@ -2353,5 +2353,8 @@ static void __exit exit_nfsd(void) +@@ -2344,5 +2344,8 @@ static void __exit exit_nfsd(void) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); diff --git a/debian/patches/bugfix/all/kbuild-fix-recordmcount-dependency.patch b/debian/patches/bugfix/all/kbuild-fix-recordmcount-dependency.patch index 8f9c21f..28ea6d3 100644 --- a/debian/patches/bugfix/all/kbuild-fix-recordmcount-dependency.patch +++ b/debian/patches/bugfix/all/kbuild-fix-recordmcount-dependency.patch @@ -9,7 +9,7 @@ sources. 
--- a/scripts/Makefile.build +++ b/scripts/Makefile.build -@@ -210,6 +210,11 @@ cmd_record_mcount = $(if $(findstring $( +@@ -188,6 +188,11 @@ cmd_record_mcount = $(if $(findstring $( $(sub_cmd_record_mcount)) endif # CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT diff --git a/debian/patches/bugfix/all/libapi-define-_fortify_source-as-2-not-empty.patch b/debian/patches/bugfix/all/libapi-define-_fortify_source-as-2-not-empty.patch index 0412b83..e50d716 100644 --- a/debian/patches/bugfix/all/libapi-define-_fortify_source-as-2-not-empty.patch +++ b/debian/patches/bugfix/all/libapi-define-_fortify_source-as-2-not-empty.patch @@ -7,7 +7,7 @@ Signed-off-by: Ben Hutchings <benh@debian.org> --- --- a/tools/lib/api/Makefile +++ b/tools/lib/api/Makefile -@@ -29,7 +29,7 @@ endif +@@ -35,7 +35,7 @@ ifeq ($(DEBUG),0) endif ifeq ($(DEBUG),0) diff --git a/debian/patches/bugfix/all/module-disable-matching-missing-version-crc.patch b/debian/patches/bugfix/all/module-disable-matching-missing-version-crc.patch index 8e0a7ee..5ef931e 100644 --- a/debian/patches/bugfix/all/module-disable-matching-missing-version-crc.patch +++ b/debian/patches/bugfix/all/module-disable-matching-missing-version-crc.patch @@ -9,7 +9,7 @@ alternative may allow subverting module signing. --- --- a/kernel/module/version.c +++ b/kernel/module/version.c -@@ -46,9 +46,8 @@ int check_version(const struct load_info +@@ -63,9 +63,8 @@ int check_version(const struct load_info goto bad_version; } diff --git a/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch b/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch deleted file mode 100644 index 04b49b8..0000000 --- a/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch +++ /dev/null @@ -1,37 +0,0 @@ -From: Scott Mayhew <smayhew@redhat.com> -Date: Tue, 10 Dec 2024 07:25:54 -0500 -Subject: nfsd: fix legacy client tracking initialization -Origin: https://git.kernel.org/linus/de71d4e211eddb670b285a0ea477a299601ce1ca - -Get rid of the nfsd4_legacy_tracking_ops->init() call in -check_for_legacy_methods(). That will be handled in the caller -(nfsd4_client_tracking_init()). Otherwise, we'll wind up calling -nfsd4_legacy_tracking_ops->init() twice, and the second time we'll -trigger the BUG_ON() in nfsd4_init_recdir(). 
- -Fixes: 74fd48739d04 ("nfsd: new Kconfig option for legacy client tracking") -Reported-by: Jur van der Burg <jur@avtware.com> -Link: https://bugzilla.kernel.org/show_bug.cgi?id=219580 -Signed-off-by: Scott Mayhew <smayhew@redhat.com> -Reviewed-by: Jeff Layton <jlayton@kernel.org> -Tested-by: Salvatore Bonaccorso <carnil@debian.org> -Signed-off-by: Chuck Lever <chuck.lever@oracle.com> ---- - fs/nfsd/nfs4recover.c | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c -index 4a765555bf84..1c8fcb04b3cd 100644 ---- a/fs/nfsd/nfs4recover.c -+++ b/fs/nfsd/nfs4recover.c -@@ -2052,7 +2052,6 @@ static inline int check_for_legacy_methods(int status, struct net *net) - path_put(&path); - if (status) - return -ENOTDIR; -- status = nn->client_tracking_ops->init(net); - } - return status; - } --- -2.47.2 - diff --git a/debian/patches/bugfix/all/perf-docs-Fix-perf-check-manual-page-built-with-asci.patch b/debian/patches/bugfix/all/perf-docs-Fix-perf-check-manual-page-built-with-asci.patch index a6cc3e5..48f4e3f 100644 --- a/debian/patches/bugfix/all/perf-docs-Fix-perf-check-manual-page-built-with-asci.patch +++ b/debian/patches/bugfix/all/perf-docs-Fix-perf-check-manual-page-built-with-asci.patch @@ -20,8 +20,6 @@ Signed-off-by: Ben Hutchings <benh@debian.org> tools/perf/Documentation/perf-check.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt -index a764a4629220..80eb1de4eee0 100644 --- a/tools/perf/Documentation/perf-check.txt +++ b/tools/perf/Documentation/perf-check.txt @@ -1,5 +1,5 @@ diff --git a/debian/patches/bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch b/debian/patches/bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch index 3153c71..ba1f741 100644 --- a/debian/patches/bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch +++ b/debian/patches/bugfix/all/perf-tools-pass-extra_cflags-through-to-libbpf-build-again.patch @@ -16,7 +16,7 @@ Signed-off-by: Ben Hutchings <benh@debian.org> --- --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf -@@ -951,7 +951,7 @@ $(LIBAPI)-clean: +@@ -945,7 +945,7 @@ $(LIBAPI)-clean: $(LIBBPF): FORCE | $(LIBBPF_OUTPUT) $(Q)$(MAKE) -C $(LIBBPF_DIR) FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) \ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= subdir= \ diff --git a/debian/patches/bugfix/all/tools-build-remove-bpf-run-time-check-at-build-time.patch b/debian/patches/bugfix/all/tools-build-remove-bpf-run-time-check-at-build-time.patch index 3818dbb..3c5e7e7 100644 --- a/debian/patches/bugfix/all/tools-build-remove-bpf-run-time-check-at-build-time.patch +++ b/debian/patches/bugfix/all/tools-build-remove-bpf-run-time-check-at-build-time.patch @@ -11,7 +11,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c -@@ -35,8 +35,10 @@ int main(void) +@@ -41,8 +41,10 @@ int main(void) attr.prog_flags = 0; /* diff --git a/debian/patches/bugfix/all/tools-perf-fix-missing-ldflags-for-some-programs.patch b/debian/patches/bugfix/all/tools-perf-fix-missing-ldflags-for-some-programs.patch index 2d8f29d..3bec90c 100644 --- a/debian/patches/bugfix/all/tools-perf-fix-missing-ldflags-for-some-programs.patch +++ b/debian/patches/bugfix/all/tools-perf-fix-missing-ldflags-for-some-programs.patch @@ -6,7 +6,7 @@ Signed-off-by: Ben Hutchings <benh@debian.org> --- --- 
a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf -@@ -785,7 +785,7 @@ $(OUTPUT)dlfilters/%.o: dlfilters/%.c in +@@ -919,7 +919,7 @@ $(OUTPUT)dlfilters/%.o: dlfilters/%.c in .SECONDARY: $(DLFILTERS:.so=.o) $(OUTPUT)dlfilters/%.so: $(OUTPUT)dlfilters/%.o diff --git a/debian/patches/bugfix/all/tools_lib_symbol_use_d_fortify_source_2_for_non_debug_builds.patch b/debian/patches/bugfix/all/tools_lib_symbol_use_d_fortify_source_2_for_non_debug_builds.patch index 9662520..c3e3817 100644 --- a/debian/patches/bugfix/all/tools_lib_symbol_use_d_fortify_source_2_for_non_debug_builds.patch +++ b/debian/patches/bugfix/all/tools_lib_symbol_use_d_fortify_source_2_for_non_debug_builds.patch @@ -14,11 +14,9 @@ Acked-by: Ian Rogers <irogers@google.com> tools/lib/symbol/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -diff --git a/tools/lib/symbol/Makefile b/tools/lib/symbol/Makefile -index 13d43c6f92b4..4a08cc4e19f2 100644 --- a/tools/lib/symbol/Makefile +++ b/tools/lib/symbol/Makefile -@@ -39,7 +39,7 @@ endif +@@ -35,7 +35,7 @@ ifeq ($(DEBUG),0) endif ifeq ($(DEBUG),0) @@ -27,5 +25,3 @@ index 13d43c6f92b4..4a08cc4e19f2 100644 endif # Treat warnings as errors unless directed not to --- -2.39.2 diff --git a/debian/patches/bugfix/all/usbip-document-tcp-wrappers.patch b/debian/patches/bugfix/all/usbip-document-tcp-wrappers.patch index d9d7b30..efb493f 100644 --- a/debian/patches/bugfix/all/usbip-document-tcp-wrappers.patch +++ b/debian/patches/bugfix/all/usbip-document-tcp-wrappers.patch @@ -17,7 +17,7 @@ Add references to TCP wrappers configuration in the manual page. .SH OPTIONS .HP -@@ -69,7 +70,8 @@ Show version. +@@ -75,7 +76,8 @@ Show version. .B usbipd offers no authentication or authorization for USB/IP. Any diff --git a/debian/patches/bugfix/arm/arm-dts-kirkwood-fix-sata-pinmux-ing-for-ts419.patch b/debian/patches/bugfix/arm/arm-dts-kirkwood-fix-sata-pinmux-ing-for-ts419.patch index 52acb9f..a113627 100644 --- a/debian/patches/bugfix/arm/arm-dts-kirkwood-fix-sata-pinmux-ing-for-ts419.patch +++ b/debian/patches/bugfix/arm/arm-dts-kirkwood-fix-sata-pinmux-ing-for-ts419.patch @@ -21,7 +21,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/arch/arm/boot/dts/marvell/kirkwood-ts419.dtsi +++ b/arch/arm/boot/dts/marvell/kirkwood-ts419.dtsi -@@ -69,3 +69,11 @@ +@@ -67,3 +67,11 @@ phy-handle = <&ethphy1>; }; }; diff --git a/debian/patches/bugfix/arm/arm-mm-export-__sync_icache_dcache-for-xen-privcmd.patch b/debian/patches/bugfix/arm/arm-mm-export-__sync_icache_dcache-for-xen-privcmd.patch index cd5b36f..a3a198c 100644 --- a/debian/patches/bugfix/arm/arm-mm-export-__sync_icache_dcache-for-xen-privcmd.patch +++ b/debian/patches/bugfix/arm/arm-mm-export-__sync_icache_dcache-for-xen-privcmd.patch @@ -17,11 +17,9 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> arch/arm/mm/flush.c | 1 + 1 file changed, 1 insertion(+) -Index: debian-kernel/arch/arm/mm/flush.c -=================================================================== ---- debian-kernel.orig/arch/arm/mm/flush.c -+++ debian-kernel/arch/arm/mm/flush.c -@@ -292,6 +292,7 @@ void __sync_icache_dcache(pte_t pteval) +--- a/arch/arm/mm/flush.c ++++ b/arch/arm/mm/flush.c +@@ -310,6 +310,7 @@ void __sync_icache_dcache(pte_t pteval) if (pte_exec(pteval)) __flush_icache_all(); } diff --git a/debian/patches/bugfix/powerpc/powerpc-boot-fix-missing-crc32poly.h-when-building-with-kernel_xz.patch b/debian/patches/bugfix/powerpc/powerpc-boot-fix-missing-crc32poly.h-when-building-with-kernel_xz.patch index a00a2a4..5581261 100644 ---
a/debian/patches/bugfix/powerpc/powerpc-boot-fix-missing-crc32poly.h-when-building-with-kernel_xz.patch +++ b/debian/patches/bugfix/powerpc/powerpc-boot-fix-missing-crc32poly.h-when-building-with-kernel_xz.patch @@ -24,11 +24,9 @@ Tested-by: Michal Kubecek <mkubecek@suse.cz> arch/powerpc/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -Index: linux/arch/powerpc/boot/Makefile -=================================================================== ---- linux.orig/arch/powerpc/boot/Makefile -+++ linux/arch/powerpc/boot/Makefile -@@ -70,7 +70,7 @@ BOOTCFLAGS += -fno-stack-protector +--- a/arch/powerpc/boot/Makefile ++++ b/arch/powerpc/boot/Makefile +@@ -97,7 +97,7 @@ BOOTCFLAGS += -fno-stack-protector endif BOOTCFLAGS += -include $(srctree)/include/linux/compiler_attributes.h diff --git a/debian/patches/bugfix/sh/sh-boot-do-not-use-hyphen-in-exported-variable-name.patch b/debian/patches/bugfix/sh/sh-boot-do-not-use-hyphen-in-exported-variable-name.patch index 31eb2eb..9a30440 100644 --- a/debian/patches/bugfix/sh/sh-boot-do-not-use-hyphen-in-exported-variable-name.patch +++ b/debian/patches/bugfix/sh/sh-boot-do-not-use-hyphen-in-exported-variable-name.patch @@ -23,10 +23,8 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> arch/sh/boot/romimage/Makefile | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) -Index: linux/arch/sh/Makefile -=================================================================== ---- linux.orig/arch/sh/Makefile -+++ linux/arch/sh/Makefile +--- a/arch/sh/Makefile ++++ b/arch/sh/Makefile @@ -102,16 +102,16 @@ UTS_MACHINE := sh LDFLAGS_vmlinux += -e _stext @@ -49,11 +47,9 @@ Index: linux/arch/sh/Makefile # Mach groups machdir-$(CONFIG_SOLUTION_ENGINE) += mach-se -Index: linux/arch/sh/boot/compressed/Makefile -=================================================================== ---- linux.orig/arch/sh/boot/compressed/Makefile -+++ linux/arch/sh/boot/compressed/Makefile -@@ -36,7 +36,7 @@ endif +--- a/arch/sh/boot/compressed/Makefile ++++ b/arch/sh/boot/compressed/Makefile +@@ -27,7 +27,7 @@ endif ccflags-remove-$(CONFIG_MCOUNT) += -pg @@ -62,7 +58,7 @@ Index: linux/arch/sh/boot/compressed/Makefile -T $(obj)/../../kernel/vmlinux.lds KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING -@@ -60,7 +60,7 @@ $(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.b +@@ -51,7 +51,7 @@ $(obj)/vmlinux.bin.lzo: $(obj)/vmlinux.b OBJCOPYFLAGS += -R .empty_zero_page @@ -71,10 +67,8 @@ Index: linux/arch/sh/boot/compressed/Makefile $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE $(call if_changed,ld) -Index: linux/arch/sh/boot/romimage/Makefile -=================================================================== ---- linux.orig/arch/sh/boot/romimage/Makefile -+++ linux/arch/sh/boot/romimage/Makefile +--- a/arch/sh/boot/romimage/Makefile ++++ b/arch/sh/boot/romimage/Makefile @@ -13,7 +13,7 @@ mmcif-obj-$(CONFIG_CPU_SUBTYPE_SH7724) : load-$(CONFIG_ROMIMAGE_MMCIF) := $(mmcif-load-y) obj-$(CONFIG_ROMIMAGE_MMCIF) := $(mmcif-obj-y) diff --git a/debian/patches/bugfix/x86/perf-tools-fix-unwind-build-on-i386.patch b/debian/patches/bugfix/x86/perf-tools-fix-unwind-build-on-i386.patch index f9baa7b..07410f8 100644 --- a/debian/patches/bugfix/x86/perf-tools-fix-unwind-build-on-i386.patch +++ b/debian/patches/bugfix/x86/perf-tools-fix-unwind-build-on-i386.patch @@ -15,7 +15,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/tools/perf/arch/x86/util/unwind-libunwind.c +++ b/tools/perf/arch/x86/util/unwind-libunwind.c -@@ -66,7 +66,7 @@ int LIBUNWIND__ARCH_REG_ID(int 
regnum) +@@ -67,7 +67,7 @@ int LIBUNWIND__ARCH_REG_ID(int regnum) break; default: pr_err("unwind: invalid reg id %d\n", regnum); @@ -24,7 +24,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> } return id; -@@ -106,7 +106,7 @@ int LIBUNWIND__ARCH_REG_ID(int regnum) +@@ -107,7 +107,7 @@ int LIBUNWIND__ARCH_REG_ID(int regnum) break; default: pr_err("unwind: invalid reg id %d\n", regnum); diff --git a/debian/patches/bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch b/debian/patches/bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch index 62b4eb7..a30414d 100644 --- a/debian/patches/bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch +++ b/debian/patches/bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch @@ -16,7 +16,7 @@ un-blacklist it in udev. --- --- a/drivers/video/fbdev/via/via-core.c +++ b/drivers/video/fbdev/via/via-core.c -@@ -695,7 +695,14 @@ static const struct pci_device_id via_pc +@@ -700,7 +700,14 @@ static const struct pci_device_id via_pc .driver_data = UNICHROME_VX900 }, { } }; diff --git a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch index df5268c..51074c7 100644 --- a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch +++ b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch @@ -21,7 +21,7 @@ Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com> --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -118,6 +118,12 @@ +@@ -119,6 +119,12 @@ #include <kunit/visibility.h> @@ -34,7 +34,7 @@ Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com> /* * Minimum number of threads to boot the kernel */ -@@ -2138,6 +2144,10 @@ __latent_entropy struct task_struct *cop +@@ -2167,6 +2173,10 @@ __latent_entropy struct task_struct *cop if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -45,7 +45,7 @@ Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com> /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. 
-@@ -3287,6 +3297,12 @@ int ksys_unshare(unsigned long unshare_f +@@ -3320,6 +3330,12 @@ int ksys_unshare(unsigned long unshare_f if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -71,7 +71,7 @@ Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com> #endif /* CONFIG_SYSCTL */ /* -@@ -1618,6 +1622,15 @@ static struct ctl_table kern_table[] = { +@@ -1617,6 +1621,15 @@ static const struct ctl_table kern_table .mode = 0644, .proc_handler = proc_dointvec, }, diff --git a/debian/patches/debian/af_802154-Disable-auto-loading-as-mitigation-against.patch b/debian/patches/debian/af_802154-Disable-auto-loading-as-mitigation-against.patch index 7c530cb..01f3620 100644 --- a/debian/patches/debian/af_802154-Disable-auto-loading-as-mitigation-against.patch +++ b/debian/patches/debian/af_802154-Disable-auto-loading-as-mitigation-against.patch @@ -21,7 +21,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c -@@ -1138,4 +1138,4 @@ module_init(af_ieee802154_init); +@@ -1140,4 +1140,4 @@ module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IEEE 802.15.4 socket interface"); diff --git a/debian/patches/debian/arch-sh4-fix-uimage-build.patch b/debian/patches/debian/arch-sh4-fix-uimage-build.patch index 89e626b..3c7200d 100644 --- a/debian/patches/debian/arch-sh4-fix-uimage-build.patch +++ b/debian/patches/debian/arch-sh4-fix-uimage-build.patch @@ -9,11 +9,9 @@ Forwarded: not-needed arch/sh/Makefile | 1 - 1 file changed, 1 deletion(-) -diff --git a/arch/sh/Makefile b/arch/sh/Makefile -index da9cf952f33c..974bbd9dcfcf 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile -@@ -85,7 +85,6 @@ OBJCOPYFLAGS := -O binary -R .note -R .note.gnu.build-id -R .comment \ +@@ -81,7 +81,6 @@ OBJCOPYFLAGS := -O binary -R .note -R .n # Give the various platforms the opportunity to set default image types defaultimage-y := zImage @@ -21,6 +19,3 @@ index da9cf952f33c..974bbd9dcfcf 100644 defaultimage-$(CONFIG_SH_RSK) := uImage defaultimage-$(CONFIG_SH_URQUELL) := uImage defaultimage-$(CONFIG_SH_MIGOR) := uImage --- -2.27.0 - diff --git a/debian/patches/debian/btrfs-warn-about-raid5-6-being-experimental-at-mount.patch b/debian/patches/debian/btrfs-warn-about-raid5-6-being-experimental-at-mount.patch index a9a3ce9..7d96c3b 100644 --- a/debian/patches/debian/btrfs-warn-about-raid5-6-being-experimental-at-mount.patch +++ b/debian/patches/debian/btrfs-warn-about-raid5-6-being-experimental-at-mount.patch @@ -20,15 +20,12 @@ implementation went from disk-io.c to super.c; forwarded the issue] fs/btrfs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) -diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c -index 101f786963d4..2c409bce1bf5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c -@@ -731,6 +731,18 @@ static void set_device_specific_options(struct btrfs_fs_info *fs_info) - !fs_info->fs_devices->rotating) +@@ -765,6 +765,18 @@ static void set_device_specific_options( btrfs_set_opt(fs_info->mount_opt, SSD); -+ /* + /* + * Warn about RAID5/6 being experimental at mount time + */ + if ((fs_info->avail_data_alloc_bits | @@ -40,6 +37,7 @@ index 101f786963d4..2c409bce1bf5 100644 + add_taint(TAINT_AUX, LOCKDEP_STILL_OK); + } + - /* ++ /* * For devices supporting discard turn on discard=async automatically, * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. 
diff --git a/debian/patches/debian/dccp-disable-auto-loading-as-mitigation-against-local-exploits.patch b/debian/patches/debian/dccp-disable-auto-loading-as-mitigation-against-local-exploits.patch index 8358318..9202fcb 100644 --- a/debian/patches/debian/dccp-disable-auto-loading-as-mitigation-against-local-exploits.patch +++ b/debian/patches/debian/dccp-disable-auto-loading-as-mitigation-against-local-exploits.patch @@ -15,7 +15,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c -@@ -1071,8 +1071,8 @@ module_exit(dccp_v4_exit); +@@ -1099,8 +1099,8 @@ module_exit(dccp_v4_exit); * values directly, Also cover the case where the protocol is not specified, * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP */ @@ -28,7 +28,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c -@@ -1125,8 +1125,8 @@ module_exit(dccp_v6_exit); +@@ -1174,8 +1174,8 @@ module_exit(dccp_v6_exit); * values directly, Also cover the case where the protocol is not specified, * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP */ diff --git a/debian/patches/debian/dfsg/arch-powerpc-platforms-8xx-ucode-disable.patch b/debian/patches/debian/dfsg/arch-powerpc-platforms-8xx-ucode-disable.patch index 0e51769..b9dca1d 100644 --- a/debian/patches/debian/dfsg/arch-powerpc-platforms-8xx-ucode-disable.patch +++ b/debian/patches/debian/dfsg/arch-powerpc-platforms-8xx-ucode-disable.patch @@ -3,11 +3,9 @@ Date: Mon, 13 Apr 2009 17:34:00 +0100 Subject: Remove microcode patches for mgsuvd (not enabled in Debian configs) Forwarded: not-needed -diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig -index 48a920a..81570b6 100644 --- a/arch/powerpc/platforms/8xx/Kconfig +++ b/arch/powerpc/platforms/8xx/Kconfig -@@ -160,16 +160,19 @@ config NO_UCODE_PATCH +@@ -136,16 +136,19 @@ config NO_UCODE_PATCH config USB_SOF_UCODE_PATCH bool "USB SOF patch" diff --git a/debian/patches/debian/dfsg/drivers-media-dvb-dvb-usb-af9005-disable.patch b/debian/patches/debian/dfsg/drivers-media-dvb-dvb-usb-af9005-disable.patch index 4e42da9..1cc5a29 100644 --- a/debian/patches/debian/dfsg/drivers-media-dvb-dvb-usb-af9005-disable.patch +++ b/debian/patches/debian/dfsg/drivers-media-dvb-dvb-usb-af9005-disable.patch @@ -5,7 +5,7 @@ Forwarded: not-needed --- a/drivers/media/usb/dvb-usb/Kconfig +++ b/drivers/media/usb/dvb-usb/Kconfig -@@ -227,6 +227,7 @@ config DVB_USB_OPERA1 +@@ -35,6 +35,7 @@ config DVB_USB_A800 config DVB_USB_AF9005 tristate "Afatech AF9005 DVB-T USB1.1 support" diff --git a/debian/patches/debian/export-symbols-needed-by-android-drivers.patch b/debian/patches/debian/export-symbols-needed-by-android-drivers.patch index 8887113..aae7908 100644 --- a/debian/patches/debian/export-symbols-needed-by-android-drivers.patch +++ b/debian/patches/debian/export-symbols-needed-by-android-drivers.patch @@ -22,7 +22,7 @@ Export the currently un-exported symbols it depends on. --- a/fs/file.c +++ b/fs/file.c -@@ -823,6 +823,7 @@ struct file *file_close_fd(unsigned int +@@ -837,6 +837,7 @@ struct file *file_close_fd(unsigned int return file; } @@ -42,7 +42,7 @@ Export the currently un-exported symbols it depends on. 
struct msg_msgseg *next; --- a/ipc/namespace.c +++ b/ipc/namespace.c -@@ -205,6 +205,7 @@ void put_ipc_ns(struct ipc_namespace *ns +@@ -207,6 +207,7 @@ void put_ipc_ns(struct ipc_namespace *ns schedule_work(&free_ipc_work); } } @@ -72,7 +72,7 @@ Export the currently un-exported symbols it depends on. * Note: we use "set_current_state()" _after_ the wait-queue add, --- a/kernel/task_work.c +++ b/kernel/task_work.c -@@ -97,6 +97,7 @@ int task_work_add(struct task_struct *ta +@@ -96,6 +96,7 @@ int task_work_add(struct task_struct *ta return 0; } @@ -82,7 +82,7 @@ Export the currently un-exported symbols it depends on. * task_work_cancel_match - cancel a pending work added by task_work_add() --- a/mm/memory.c +++ b/mm/memory.c -@@ -1934,6 +1934,7 @@ void zap_page_range_single(struct vm_are +@@ -2030,6 +2030,7 @@ void zap_page_range_single(struct vm_are tlb_finish_mmu(&tlb); hugetlb_zap_end(vma, details); } @@ -92,7 +92,7 @@ Export the currently un-exported symbols it depends on. * zap_vma_ptes - remove ptes mapping the vma --- a/security/security.c +++ b/security/security.c -@@ -890,6 +890,7 @@ int security_binder_set_context_mgr(cons +@@ -996,6 +996,7 @@ int security_binder_set_context_mgr(cons { return call_int_hook(binder_set_context_mgr, mgr); } @@ -100,7 +100,7 @@ Export the currently un-exported symbols it depends on. /** * security_binder_transaction() - Check if a binder transaction is allowed -@@ -905,6 +906,7 @@ int security_binder_transaction(const st +@@ -1011,6 +1012,7 @@ int security_binder_transaction(const st { return call_int_hook(binder_transaction, from, to); } @@ -108,7 +108,7 @@ Export the currently un-exported symbols it depends on. /** * security_binder_transfer_binder() - Check if a binder transfer is allowed -@@ -920,6 +922,7 @@ int security_binder_transfer_binder(cons +@@ -1026,6 +1028,7 @@ int security_binder_transfer_binder(cons { return call_int_hook(binder_transfer_binder, from, to); } @@ -116,7 +116,7 @@ Export the currently un-exported symbols it depends on. /** * security_binder_transfer_file() - Check if a binder file xfer is allowed -@@ -936,6 +939,7 @@ int security_binder_transfer_file(const +@@ -1042,6 +1045,7 @@ int security_binder_transfer_file(const { return call_int_hook(binder_transfer_file, from, to, file); } diff --git a/debian/patches/debian/fanotify-taint-on-use-of-fanotify_access_permissions.patch b/debian/patches/debian/fanotify-taint-on-use-of-fanotify_access_permissions.patch index bb10bbf..6634997 100644 --- a/debian/patches/debian/fanotify-taint-on-use-of-fanotify_access_permissions.patch +++ b/debian/patches/debian/fanotify-taint-on-use-of-fanotify_access_permissions.patch @@ -12,7 +12,7 @@ actually used. --- --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c -@@ -1756,6 +1756,14 @@ static int do_fanotify_mark(int fanotify +@@ -1838,6 +1838,14 @@ static int do_fanotify_mark(int fanotify umask = FANOTIFY_EVENT_FLAGS; } diff --git a/debian/patches/debian/fjes-disable-autoload.patch b/debian/patches/debian/fjes-disable-autoload.patch index 873c690..1f18e37 100644 --- a/debian/patches/debian/fjes-disable-autoload.patch +++ b/debian/patches/debian/fjes-disable-autoload.patch @@ -11,11 +11,9 @@ all the other systems where the same device ID appears, so disable auto-loading. 
--- -Index: linux/drivers/net/fjes/fjes_main.c -=================================================================== ---- linux.orig/drivers/net/fjes/fjes_main.c -+++ linux/drivers/net/fjes/fjes_main.c -@@ -36,7 +36,7 @@ static const struct acpi_device_id fjes_ +--- a/drivers/net/fjes/fjes_main.c ++++ b/drivers/net/fjes/fjes_main.c +@@ -34,7 +34,7 @@ static const struct acpi_device_id fjes_ {ACPI_MOTHERBOARD_RESOURCE_HID, 0}, {"", 0}, }; diff --git a/debian/patches/debian/fs-enable-link-security-restrictions-by-default.patch b/debian/patches/debian/fs-enable-link-security-restrictions-by-default.patch index eab0e78..f8a9733 100644 --- a/debian/patches/debian/fs-enable-link-security-restrictions-by-default.patch +++ b/debian/patches/debian/fs-enable-link-security-restrictions-by-default.patch @@ -9,7 +9,7 @@ This reverts commit 561ec64ae67ef25cac8d72bb9c4bfc955edfd415 --- a/fs/namei.c +++ b/fs/namei.c -@@ -1020,8 +1020,8 @@ static inline void put_link(struct namei +@@ -1094,8 +1094,8 @@ static inline void put_link(struct namei path_put(&last->link); } diff --git a/debian/patches/debian/hamradio-disable-auto-loading-as-mitigation-against-local-exploits.patch b/debian/patches/debian/hamradio-disable-auto-loading-as-mitigation-against-local-exploits.patch index b532d00..a222019 100644 --- a/debian/patches/debian/hamradio-disable-auto-loading-as-mitigation-against-local-exploits.patch +++ b/debian/patches/debian/hamradio-disable-auto-loading-as-mitigation-against-local-exploits.patch @@ -15,7 +15,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c -@@ -1986,7 +1986,7 @@ module_init(ax25_init); +@@ -2077,7 +2077,7 @@ module_init(ax25_init); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol"); MODULE_LICENSE("GPL"); @@ -26,7 +26,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> { --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c -@@ -1486,7 +1486,7 @@ MODULE_PARM_DESC(nr_ndevs, "number of NE +@@ -1498,7 +1498,7 @@ MODULE_PARM_DESC(nr_ndevs, "number of NE MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); MODULE_LICENSE("GPL"); @@ -37,7 +37,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> { --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c -@@ -1577,7 +1577,7 @@ MODULE_PARM_DESC(rose_ndevs, "number of +@@ -1638,7 +1638,7 @@ MODULE_PARM_DESC(rose_ndevs, "number of MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol"); MODULE_LICENSE("GPL"); diff --git a/debian/patches/debian/iwlwifi-do-not-request-unreleased-firmware.patch b/debian/patches/debian/iwlwifi-do-not-request-unreleased-firmware.patch index c806143..4479cfc 100644 --- a/debian/patches/debian/iwlwifi-do-not-request-unreleased-firmware.patch +++ b/debian/patches/debian/iwlwifi-do-not-request-unreleased-firmware.patch @@ -15,7 +15,7 @@ requesting the unreleased firmware. 
--- a/drivers/net/wireless/intel/iwlwifi/cfg/6000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/6000.c -@@ -31,7 +31,7 @@ +@@ -12,7 +12,7 @@ #include "dvm/commands.h" /* needed for BT for now */ /* Highest firmware API version supported */ diff --git a/debian/patches/debian/kernelvariables.patch b/debian/patches/debian/kernelvariables.patch index 7162b07..a317e40 100644 --- a/debian/patches/debian/kernelvariables.patch +++ b/debian/patches/debian/kernelvariables.patch @@ -19,7 +19,7 @@ use of $(ARCH) needs to be moved after this. --- --- a/Makefile +++ b/Makefile -@@ -406,36 +406,6 @@ include $(srctree)/scripts/subarch.inclu +@@ -405,36 +405,6 @@ include $(srctree)/scripts/subarch.inclu # Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile ARCH ?= $(SUBARCH) @@ -56,7 +56,7 @@ use of $(ARCH) needs to be moved after this. KCONFIG_CONFIG ?= .config export KCONFIG_CONFIG -@@ -551,6 +521,35 @@ RUSTFLAGS_KERNEL = +@@ -555,6 +525,35 @@ RUSTFLAGS_KERNEL = AFLAGS_KERNEL = LDFLAGS_vmlinux = diff --git a/debian/patches/debian/linux-perf-remove-remaining-source-filenames-from-executable.patch b/debian/patches/debian/linux-perf-remove-remaining-source-filenames-from-executable.patch index e78e7d2..e2b49c6 100644 --- a/debian/patches/debian/linux-perf-remove-remaining-source-filenames-from-executable.patch +++ b/debian/patches/debian/linux-perf-remove-remaining-source-filenames-from-executable.patch @@ -15,7 +15,7 @@ to the installed location. --- --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c -@@ -662,10 +662,12 @@ static int report__browse_hists(struct r +@@ -660,10 +660,12 @@ static int report__browse_hists(struct r path = system_path(TIPDIR); if (perf_tip(&help, path) || help == NULL) { diff --git a/debian/patches/debian/makefile-make-compiler-version-comparison-optional.patch b/debian/patches/debian/makefile-make-compiler-version-comparison-optional.patch index f0d5c80..c967e8a 100644 --- a/debian/patches/debian/makefile-make-compiler-version-comparison-optional.patch +++ b/debian/patches/debian/makefile-make-compiler-version-comparison-optional.patch @@ -20,7 +20,7 @@ is non-empty. 
--- --- a/Makefile +++ b/Makefile -@@ -1753,7 +1753,7 @@ PHONY += prepare +@@ -1873,7 +1873,7 @@ PHONY += prepare # now expand this into a simple variable to reduce the cost of shell evaluations prepare: CC_VERSION_TEXT := $(CC_VERSION_TEXT) prepare: diff --git a/debian/patches/debian/perf-traceevent-support-asciidoctor-for-documentatio.patch b/debian/patches/debian/perf-traceevent-support-asciidoctor-for-documentatio.patch index a1cbb2c..4af7427 100644 --- a/debian/patches/debian/perf-traceevent-support-asciidoctor-for-documentatio.patch +++ b/debian/patches/debian/perf-traceevent-support-asciidoctor-for-documentatio.patch @@ -9,8 +9,6 @@ Forwarded: not-needed tools/perf/Documentation/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -diff --git a/tools/lib/perf/Documentation/Makefile b/tools/lib/perf/Documentation/Makefile -index 972754082a85..272d06173a3e 100644 --- a/tools/lib/perf/Documentation/Makefile +++ b/tools/lib/perf/Documentation/Makefile @@ -35,7 +35,7 @@ htmldir = $(docdir)/html @@ -22,11 +20,9 @@ index 972754082a85..272d06173a3e 100644 ASCIIDOC_HTML = xhtml11 MANPAGE_XSL = manpage-normal.xsl XMLTO_EXTRA = -diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile -index 6e54979c2124..7bfa6ae971ab 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile -@@ -48,7 +48,7 @@ man5dir=$(mandir)/man5 +@@ -45,7 +45,7 @@ man5dir=$(mandir)/man5 man7dir=$(mandir)/man7 ASCIIDOC=asciidoc @@ -35,6 +31,3 @@ index 6e54979c2124..7bfa6ae971ab 100644 ASCIIDOC_HTML = xhtml11 MANPAGE_XSL = manpage-normal.xsl XMLTO_EXTRA = --- -2.28.0 - diff --git a/debian/patches/debian/rds-Disable-auto-loading-as-mitigation-against-local.patch b/debian/patches/debian/rds-Disable-auto-loading-as-mitigation-against-local.patch index 3343aec..a1fa856 100644 --- a/debian/patches/debian/rds-Disable-auto-loading-as-mitigation-against-local.patch +++ b/debian/patches/debian/rds-Disable-auto-loading-as-mitigation-against-local.patch @@ -19,16 +19,11 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> net/rds/af_rds.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) -diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c -index 98e0538..d8d4525 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c -@@ -574,4 +574,4 @@ MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" +@@ -959,4 +959,4 @@ MODULE_DESCRIPTION("RDS: Reliable Datagr " v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS_NETPROTO(PF_RDS); +/* MODULE_ALIAS_NETPROTO(PF_RDS); */ --- -1.7.2.3 - diff --git a/debian/patches/debian/tools-perf-install-python-bindings.patch b/debian/patches/debian/tools-perf-install-python-bindings.patch index c3f1747..6e882fb 100644 --- a/debian/patches/debian/tools-perf-install-python-bindings.patch +++ b/debian/patches/debian/tools-perf-install-python-bindings.patch @@ -8,11 +8,9 @@ Forwarded: not-needed tools/perf/Makefile.perf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf -index 80522bcfafe0..b011c7aae742 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf -@@ -1026,7 +1026,7 @@ install-bin: install-tools install-tests install-traceevent-plugins +@@ -1139,7 +1139,7 @@ install-bin: install-tools install-tests install: install-bin try-install-man install-python_ext: @@ -21,6 +19,3 @@ index 80522bcfafe0..b011c7aae742 100644 # 'make install-doc' should call 'make -C Documentation install' $(INSTALL_DOC_TARGETS): 
--- -2.30.2 - diff --git a/debian/patches/debian/tools-perf-perf-read-vdso-in-libexec.patch b/debian/patches/debian/tools-perf-perf-read-vdso-in-libexec.patch index 044f584..7c996c7 100644 --- a/debian/patches/debian/tools-perf-perf-read-vdso-in-libexec.patch +++ b/debian/patches/debian/tools-perf-perf-read-vdso-in-libexec.patch @@ -4,7 +4,7 @@ Subject: linux-tools: Install perf-read-vdso{,x}32 in directory under /usr/lib --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf -@@ -943,21 +943,21 @@ install-tools: all install-gtk +@@ -1067,21 +1067,21 @@ install-tools: all install-gtk $(LN) '$(DESTDIR_SQ)$(bindir_SQ)/perf' '$(DESTDIR_SQ)$(bindir_SQ)/trace'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(includedir_SQ)/perf'; \ $(INSTALL) -m 644 include/perf/perf_dlfilter.h -t '$(DESTDIR_SQ)$(includedir_SQ)/perf' diff --git a/debian/patches/debian/uname-version-timestamp.patch b/debian/patches/debian/uname-version-timestamp.patch index 4be158e..28e8d1d 100644 --- a/debian/patches/debian/uname-version-timestamp.patch +++ b/debian/patches/debian/uname-version-timestamp.patch @@ -11,10 +11,8 @@ kernel image reproducible. Make mkcompile_h use $KBUILD_BUILD_VERSION_TIMESTAMP in preference to $KBUILD_BUILD_TIMESTAMP. -Index: linux/init/Makefile -=================================================================== ---- linux.orig/init/Makefile -+++ linux/init/Makefile +--- a/init/Makefile ++++ b/init/Makefile @@ -29,7 +29,7 @@ preempt-flag-$(CONFIG_PREEMPT_DYNAMIC) : preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT diff --git a/debian/patches/debian/wireless-add-debian-wireless-regdb-certificates.patch b/debian/patches/debian/wireless-add-debian-wireless-regdb-certificates.patch index edbf865..2556635 100644 --- a/debian/patches/debian/wireless-add-debian-wireless-regdb-certificates.patch +++ b/debian/patches/debian/wireless-add-debian-wireless-regdb-certificates.patch @@ -15,9 +15,6 @@ This hex dump is generated using: 1 file changed, 1426 insertions(+) create mode 100644 net/wireless/certs/debian.hex -diff --git a/net/wireless/certs/debian.hex b/net/wireless/certs/debian.hex -new file mode 100644 -index 000000000000..c5ab03f8c500 --- /dev/null +++ b/net/wireless/certs/debian.hex @@ -0,0 +1,1426 @@ @@ -1447,6 +1444,3 @@ index 000000000000..c5ab03f8c500 +0x44, +0x9f, +0x21, --- -2.25.1 - diff --git a/debian/patches/debian/yama-disable-by-default.patch b/debian/patches/debian/yama-disable-by-default.patch index 3458714..4dcdfc4 100644 --- a/debian/patches/debian/yama-disable-by-default.patch +++ b/debian/patches/debian/yama-disable-by-default.patch @@ -8,8 +8,6 @@ Forwarded: not-needed security/yama/yama_lsm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c -index efac68556b45..95ff3e778a17 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -25,7 +25,7 @@ @@ -21,7 +19,7 @@ index efac68556b45..95ff3e778a17 100644 /* describe a ptrace relationship for potential exception */ struct ptrace_relation { -@@ -476,7 +476,7 @@ static inline void yama_init_sysctl(void) { } +@@ -474,7 +474,7 @@ static inline void yama_init_sysctl(void static int __init yama_init(void) { diff --git a/debian/patches/features/all/db-mok-keyring/0003-MODSIGN-checking-the-blacklisted-hash-before-loading-a-kernel-module.patch b/debian/patches/features/all/db-mok-keyring/0003-MODSIGN-checking-the-blacklisted-hash-before-loading-a-kernel-module.patch index f2278ce..382dcf0 100644 --- 
a/debian/patches/features/all/db-mok-keyring/0003-MODSIGN-checking-the-blacklisted-hash-before-loading-a-kernel-module.patch +++ b/debian/patches/features/all/db-mok-keyring/0003-MODSIGN-checking-the-blacklisted-hash-before-loading-a-kernel-module.patch @@ -39,7 +39,7 @@ Signed-off-by: "Lee, Chun-Yi" <jlee@suse.com> #include <uapi/linux/module.h> #include "internal.h" -@@ -37,13 +39,60 @@ +@@ -37,13 +39,60 @@ void set_module_sig_enforced(void) sig_enforce = true; } @@ -101,7 +101,7 @@ Signed-off-by: "Lee, Chun-Yi" <jlee@suse.com> int ret; pr_devel("==>%s(,%zu)\n", __func__, modlen); -@@ -51,6 +100,7 @@ +@@ -51,6 +100,7 @@ int mod_verify_sig(const void *mod, stru if (modlen <= sizeof(ms)) return -EBADMSG; @@ -109,7 +109,7 @@ Signed-off-by: "Lee, Chun-Yi" <jlee@suse.com> memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); ret = mod_check_sig(&ms, modlen, "module"); -@@ -61,10 +111,17 @@ +@@ -61,10 +111,17 @@ int mod_verify_sig(const void *mod, stru modlen -= sig_len + sizeof(ms); info->len = modlen; diff --git a/debian/patches/features/all/db-mok-keyring/trust-machine-keyring-by-default.patch b/debian/patches/features/all/db-mok-keyring/trust-machine-keyring-by-default.patch index c814c6d..c42d138 100644 --- a/debian/patches/features/all/db-mok-keyring/trust-machine-keyring-by-default.patch +++ b/debian/patches/features/all/db-mok-keyring/trust-machine-keyring-by-default.patch @@ -11,8 +11,6 @@ To keep backward compatibility skip this check. security/integrity/platform_certs/machine_keyring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) -diff --git a/security/integrity/platform_certs/machine_keyring.c b/security/integrity/platform_certs/machine_keyring.c -index a401640a63cd..0627f14eacbe 100644 --- a/security/integrity/platform_certs/machine_keyring.c +++ b/security/integrity/platform_certs/machine_keyring.c @@ -68,10 +68,7 @@ static bool __init trust_moklist(void) diff --git a/debian/patches/features/all/drivers-media-dvb-usb-af9005-request_firmware.patch b/debian/patches/features/all/drivers-media-dvb-usb-af9005-request_firmware.patch index a24ba17..6cdf2ab 100644 --- a/debian/patches/features/all/drivers-media-dvb-usb-af9005-request_firmware.patch +++ b/debian/patches/features/all/drivers-media-dvb-usb-af9005-request_firmware.patch @@ -11,11 +11,9 @@ a version of the script which is directly derived from the driver. 
drivers/media/dvb/dvb-usb/af9005-fe.c | 66 ++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 14 deletions(-) -Index: debian-kernel/drivers/media/usb/dvb-usb/Kconfig -=================================================================== ---- debian-kernel.orig/drivers/media/usb/dvb-usb/Kconfig -+++ debian-kernel/drivers/media/usb/dvb-usb/Kconfig -@@ -260,10 +260,10 @@ config DVB_USB_OPERA1 +--- a/drivers/media/usb/dvb-usb/Kconfig ++++ b/drivers/media/usb/dvb-usb/Kconfig +@@ -35,10 +35,10 @@ config DVB_USB_A800 config DVB_USB_AF9005 tristate "Afatech AF9005 DVB-T USB1.1 support" @@ -27,10 +25,8 @@ Index: debian-kernel/drivers/media/usb/dvb-usb/Kconfig help Say Y here to support the Afatech AF9005 based DVB-T USB1.1 receiver and the TerraTec Cinergy T USB XE (Rev.1) -Index: debian-kernel/drivers/media/usb/dvb-usb/af9005-fe.c -=================================================================== ---- debian-kernel.orig/drivers/media/usb/dvb-usb/af9005-fe.c -+++ debian-kernel/drivers/media/usb/dvb-usb/af9005-fe.c +--- a/drivers/media/usb/dvb-usb/af9005-fe.c ++++ b/drivers/media/usb/dvb-usb/af9005-fe.c @@ -9,10 +9,26 @@ * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ diff --git a/debian/patches/features/all/lockdown/arm64-add-kernel-config-option-to-lock-down-when.patch b/debian/patches/features/all/lockdown/arm64-add-kernel-config-option-to-lock-down-when.patch deleted file mode 100644 index 61b7040..0000000 --- a/debian/patches/features/all/lockdown/arm64-add-kernel-config-option-to-lock-down-when.patch +++ /dev/null @@ -1,153 +0,0 @@ -From: Linn Crosetto <linn@hpe.com> -Date: Tue, 30 Aug 2016 11:54:38 -0600 -Subject: arm64: add kernel config option to lock down when in Secure Boot mode -Bug-Debian: https://bugs.debian.org/831827 -Forwarded: no - -Add a kernel configuration option to lock down the kernel, to restrict -userspace's ability to modify the running kernel when UEFI Secure Boot is -enabled. Based on the x86 patch by Matthew Garrett. - -Determine the state of Secure Boot in the EFI stub and pass this to the -kernel using the FDT. 
- -Signed-off-by: Linn Crosetto <linn@hpe.com> -[bwh: Forward-ported to 4.10: adjust context] -[Lukas Wunner: Forward-ported to 4.11: drop parts applied upstream] -[bwh: Forward-ported to 4.15 and lockdown patch set: - - Pass result of efi_get_secureboot() in stub through to - efi_set_secure_boot() in main kernel - - Use lockdown API and naming] -[bwh: Forward-ported to 4.19.3: adjust context in update_fdt()] -[dannf: Moved init_lockdown() call after uefi_init(), fixing SB detection] -[bwh: Drop call to init_lockdown(), as efi_set_secure_boot() now calls this] -[bwh: Forward-ported to 5.6: efi_get_secureboot() no longer takes a - sys_table parameter] -[bwh: Forward-ported to 5.7: EFI initialisation from FDT was rewritten, so: - - Add Secure Boot mode to the parameter enumeration in fdtparams.c - - Add a parameter to efi_get_fdt_params() to return the Secure Boot mode - - Since Xen does not have a property name defined for Secure Boot mode, - change efi_get_fdt_prop() to handle a missing property name by clearing - the output variable] -[Salvatore Bonaccorso: Forward-ported to 5.10: f30f242fb131 ("efi: Rename -arm-init to efi-init common for all arch") renamed arm-init.c to efi-init.c] ---- - drivers/firmware/efi/efi-init.c | 5 ++++- - drivers/firmware/efi/fdtparams.c | 12 +++++++++++- - drivers/firmware/efi/libstub/fdt.c | 6 ++++++ - include/linux/efi.h | 3 ++- - 4 files changed, 23 insertions(+), 3 deletions(-) - ---- a/drivers/firmware/efi/efi-init.c -+++ b/drivers/firmware/efi/efi-init.c -@@ -213,9 +213,10 @@ void __init efi_init(void) - { - struct efi_memory_map_data data; - u64 efi_system_table; -+ u32 secure_boot; - - /* Grab UEFI information placed in FDT by stub */ -- efi_system_table = efi_get_fdt_params(&data); -+ efi_system_table = efi_get_fdt_params(&data, &secure_boot); - if (!efi_system_table) - return; - -@@ -237,6 +238,8 @@ void __init efi_init(void) - return; - } - -+ efi_set_secure_boot(secure_boot); -+ - reserve_regions(); - /* - * For memblock manipulation, the cap should come after the memblock_add(). ---- a/drivers/firmware/efi/fdtparams.c -+++ b/drivers/firmware/efi/fdtparams.c -@@ -16,6 +16,7 @@ enum { - MMSIZE, - DCSIZE, - DCVERS, -+ SBMODE, - - PARAMCOUNT - }; -@@ -26,6 +27,7 @@ static __initconst const char name[][22] - [MMSIZE] = "MemMap Size ", - [DCSIZE] = "MemMap Desc. Size ", - [DCVERS] = "MemMap Desc. 
Version ", -+ [SBMODE] = "Secure Boot Enabled ", - }; - - static __initconst const struct { -@@ -43,6 +45,7 @@ static __initconst const struct { - [MMSIZE] = "xen,uefi-mmap-size", - [DCSIZE] = "xen,uefi-mmap-desc-size", - [DCVERS] = "xen,uefi-mmap-desc-ver", -+ [SBMODE] = "", - } - }, { - #endif -@@ -53,6 +56,7 @@ static __initconst const struct { - [MMSIZE] = "linux,uefi-mmap-size", - [DCSIZE] = "linux,uefi-mmap-desc-size", - [DCVERS] = "linux,uefi-mmap-desc-ver", -+ [SBMODE] = "linux,uefi-secure-boot", - } - } - }; -@@ -64,6 +68,11 @@ static int __init efi_get_fdt_prop(const - int len; - u64 val; - -+ if (!pname[0]) { -+ memset(var, 0, size); -+ return 0; -+ } -+ - prop = fdt_getprop(fdt, node, pname, &len); - if (!prop) - return 1; -@@ -81,7 +90,7 @@ static int __init efi_get_fdt_prop(const - return 0; - } - --u64 __init efi_get_fdt_params(struct efi_memory_map_data *mm) -+u64 __init efi_get_fdt_params(struct efi_memory_map_data *mm, u32 *secure_boot) - { - const void *fdt = initial_boot_params; - unsigned long systab; -@@ -95,6 +104,7 @@ u64 __init efi_get_fdt_params(struct efi - [MMSIZE] = { &mm->size, sizeof(mm->size) }, - [DCSIZE] = { &mm->desc_size, sizeof(mm->desc_size) }, - [DCVERS] = { &mm->desc_version, sizeof(mm->desc_version) }, -+ [SBMODE] = { secure_boot, sizeof(*secure_boot) }, - }; - - BUILD_BUG_ON(ARRAY_SIZE(target) != ARRAY_SIZE(name)); ---- a/drivers/firmware/efi/libstub/fdt.c -+++ b/drivers/firmware/efi/libstub/fdt.c -@@ -132,6 +132,12 @@ static efi_status_t update_fdt(void *ori - } - } - -+ fdt_val32 = cpu_to_fdt32(efi_get_secureboot()); -+ status = fdt_setprop(fdt, node, "linux,uefi-secure-boot", -+ &fdt_val32, sizeof(fdt_val32)); -+ if (status) -+ goto fdt_set_fail; -+ - /* Shrink the FDT back to its minimum size: */ - fdt_pack(fdt); - ---- a/include/linux/efi.h -+++ b/include/linux/efi.h -@@ -764,7 +764,8 @@ extern int efi_mem_desc_lookup(u64 phys_ - extern int __efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); - extern void efi_mem_reserve(phys_addr_t addr, u64 size); - extern int efi_mem_reserve_persistent(phys_addr_t addr, u64 size); --extern u64 efi_get_fdt_params(struct efi_memory_map_data *data); -+extern u64 efi_get_fdt_params(struct efi_memory_map_data *data, -+ u32 *secure_boot); - extern struct kobject *efi_kobj; - - extern int efi_reboot_quirk_mode; diff --git a/debian/patches/features/all/lockdown/efi-add-an-efi_secure_boot-flag-to-indicate-secure-b.patch b/debian/patches/features/all/lockdown/efi-add-an-efi_secure_boot-flag-to-indicate-secure-b.patch deleted file mode 100644 index 077fead..0000000 --- a/debian/patches/features/all/lockdown/efi-add-an-efi_secure_boot-flag-to-indicate-secure-b.patch +++ /dev/null @@ -1,153 +0,0 @@ -From: David Howells <dhowells@redhat.com> -Date: Mon, 18 Feb 2019 12:45:03 +0000 -Subject: [28/30] efi: Add an EFI_SECURE_BOOT flag to indicate secure boot mode -Origin: https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/commit?id=a5d70c55c603233c192b375f72116a395909da28 - -UEFI machines can be booted in Secure Boot mode. Add an EFI_SECURE_BOOT -flag that can be passed to efi_enabled() to find out whether secure boot is -enabled. - -Move the switch-statement in x86's setup_arch() that inteprets the -secure_boot boot parameter to generic code and set the bit there. 
- -Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> -Signed-off-by: David Howells <dhowells@redhat.com> -Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> -cc: linux-efi@vger.kernel.org -[rperier: Forward-ported to 5.5: - - Use pr_warn() - - Adjust context] -[bwh: Forward-ported to 5.6: adjust context] -[bwh: Forward-ported to 5.7: - - Use the next available bit in efi.flags - - Adjust context] ---- - arch/x86/kernel/setup.c | 14 +---------- - drivers/firmware/efi/Makefile | 1 + - drivers/firmware/efi/secureboot.c | 39 +++++++++++++++++++++++++++++++ - include/linux/efi.h | 16 ++++++++----- - 4 files changed, 51 insertions(+), 19 deletions(-) - create mode 100644 drivers/firmware/efi/secureboot.c - ---- a/arch/x86/kernel/setup.c -+++ b/arch/x86/kernel/setup.c -@@ -1070,19 +1070,7 @@ void __init setup_arch(char **cmdline_p) - /* Allocate bigger log buffer */ - setup_log_buf(1); - -- if (efi_enabled(EFI_BOOT)) { -- switch (boot_params.secure_boot) { -- case efi_secureboot_mode_disabled: -- pr_info("Secure boot disabled\n"); -- break; -- case efi_secureboot_mode_enabled: -- pr_info("Secure boot enabled\n"); -- break; -- default: -- pr_info("Secure boot could not be determined\n"); -- break; -- } -- } -+ efi_set_secure_boot(boot_params.secure_boot); - - reserve_initrd(); - ---- a/drivers/firmware/efi/Makefile -+++ b/drivers/firmware/efi/Makefile -@@ -25,6 +25,7 @@ subdir-$(CONFIG_EFI_STUB) += libstub - obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o - obj-$(CONFIG_EFI_TEST) += test/ - obj-$(CONFIG_EFI_DEV_PATH_PARSER) += dev-path-parser.o -+obj-$(CONFIG_EFI) += secureboot.o - obj-$(CONFIG_APPLE_PROPERTIES) += apple-properties.o - obj-$(CONFIG_EFI_RCI2_TABLE) += rci2-table.o - obj-$(CONFIG_EFI_EMBEDDED_FIRMWARE) += embedded-firmware.o ---- /dev/null -+++ b/drivers/firmware/efi/secureboot.c -@@ -0,0 +1,39 @@ -+ -+/* Core kernel secure boot support. -+ * -+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. -+ * Written by David Howells (dhowells@redhat.com) -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public Licence -+ * as published by the Free Software Foundation; either version -+ * 2 of the Licence, or (at your option) any later version. -+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+ -+#include <linux/efi.h> -+#include <linux/kernel.h> -+#include <linux/printk.h> -+ -+/* -+ * Decide what to do when UEFI secure boot mode is enabled. -+ */ -+void __init efi_set_secure_boot(enum efi_secureboot_mode mode) -+{ -+ if (efi_enabled(EFI_BOOT)) { -+ switch (mode) { -+ case efi_secureboot_mode_disabled: -+ pr_info("Secure boot disabled\n"); -+ break; -+ case efi_secureboot_mode_enabled: -+ set_bit(EFI_SECURE_BOOT, &efi.flags); -+ pr_info("Secure boot enabled\n"); -+ break; -+ default: -+ pr_warn("Secure boot could not be determined (mode %u)\n", -+ mode); -+ break; -+ } -+ } -+} ---- a/include/linux/efi.h -+++ b/include/linux/efi.h -@@ -864,6 +864,14 @@ static inline int efi_range_is_wc(unsign - #define EFI_MEM_ATTR 9 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ - #define EFI_MEM_NO_SOFT_RESERVE 10 /* Is the kernel configured to ignore soft reservations? */ - #define EFI_PRESERVE_BS_REGIONS 11 /* Are EFI boot-services memory segments available? */ -+#define EFI_SECURE_BOOT 12 /* Are we in Secure Boot mode? 
*/ -+ -+enum efi_secureboot_mode { -+ efi_secureboot_mode_unset, -+ efi_secureboot_mode_unknown, -+ efi_secureboot_mode_disabled, -+ efi_secureboot_mode_enabled, -+}; - - #ifdef CONFIG_EFI - /* -@@ -888,6 +896,7 @@ static inline bool efi_rt_services_suppo - return (efi.runtime_supported_mask & mask) == mask; - } - extern void efi_find_mirror(void); -+extern void __init efi_set_secure_boot(enum efi_secureboot_mode mode); - #else - static inline bool efi_enabled(int feature) - { -@@ -907,6 +916,7 @@ static inline bool efi_rt_services_suppo - } - - static inline void efi_find_mirror(void) {} -+static inline void efi_set_secure_boot(enum efi_secureboot_mode mode) {} - #endif - - extern int efi_status_to_err(efi_status_t status); -@@ -1125,13 +1135,6 @@ static inline bool efi_runtime_disabled( - extern void efi_call_virt_check_flags(unsigned long flags, const void *caller); - extern unsigned long efi_call_virt_save_flags(void); - --enum efi_secureboot_mode { -- efi_secureboot_mode_unset, -- efi_secureboot_mode_unknown, -- efi_secureboot_mode_disabled, -- efi_secureboot_mode_enabled, --}; -- - static inline - enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var) - { diff --git a/debian/patches/features/all/lockdown/efi-lock-down-the-kernel-if-booted-in-secure-boot-mo.patch b/debian/patches/features/all/lockdown/efi-lock-down-the-kernel-if-booted-in-secure-boot-mo.patch deleted file mode 100644 index 6fff3f8..0000000 --- a/debian/patches/features/all/lockdown/efi-lock-down-the-kernel-if-booted-in-secure-boot-mo.patch +++ /dev/null @@ -1,121 +0,0 @@ -From: Ben Hutchings <ben@decadent.org.uk> -Date: Tue, 10 Sep 2019 11:54:28 +0100 -Subject: efi: Lock down the kernel if booted in secure boot mode - -Based on an earlier patch by David Howells, who wrote the following -description: - -> UEFI Secure Boot provides a mechanism for ensuring that the firmware will -> only load signed bootloaders and kernels. Certain use cases may also -> require that all kernel modules also be signed. Add a configuration option -> that to lock down the kernel - which includes requiring validly signed -> modules - if the kernel is secure-booted. - -Signed-off-by: Ben Hutchings <ben@decadent.org.uk> -[Salvatore Bonaccorso: After fixing https://bugs.debian.org/956197 the -help text for LOCK_DOWN_IN_EFI_SECURE_BOOT was adjusted to mention that -lockdown is triggered in integrity mode (https://bugs.debian.org/1025417)] -Signed-off-by: Salvatore Bonaccorso <carnil@debian.org> ---- - arch/x86/kernel/setup.c | 4 ++-- - drivers/firmware/efi/secureboot.c | 3 +++ - include/linux/security.h | 6 ++++++ - security/lockdown/Kconfig | 15 +++++++++++++++ - security/lockdown/lockdown.c | 2 +- - 5 files changed, 27 insertions(+), 3 deletions(-) - ---- a/arch/x86/kernel/setup.c -+++ b/arch/x86/kernel/setup.c -@@ -904,6 +904,8 @@ void __init setup_arch(char **cmdline_p) - if (efi_enabled(EFI_BOOT)) - efi_init(); - -+ efi_set_secure_boot(boot_params.secure_boot); -+ - reserve_ibft_region(); - x86_init.resources.dmi_setup(); - -@@ -1070,8 +1072,6 @@ void __init setup_arch(char **cmdline_p) - /* Allocate bigger log buffer */ - setup_log_buf(1); - -- efi_set_secure_boot(boot_params.secure_boot); -- - reserve_initrd(); - - acpi_table_upgrade(); ---- a/drivers/firmware/efi/secureboot.c -+++ b/drivers/firmware/efi/secureboot.c -@@ -15,6 +15,7 @@ - #include <linux/efi.h> - #include <linux/kernel.h> - #include <linux/printk.h> -+#include <linux/security.h> - - /* - * Decide what to do when UEFI secure boot mode is enabled. 
-@@ -28,6 +29,10 @@ void __init efi_set_secure_boot(enum efi - break; - case efi_secureboot_mode_enabled: - set_bit(EFI_SECURE_BOOT, &efi.flags); -+#ifdef CONFIG_LOCK_DOWN_IN_EFI_SECURE_BOOT -+ lock_kernel_down("EFI Secure Boot", -+ LOCKDOWN_INTEGRITY_MAX); -+#endif - pr_info("Secure boot enabled\n"); - break; - default: ---- a/include/linux/security.h -+++ b/include/linux/security.h -@@ -522,6 +522,7 @@ int security_inode_notifysecctx(struct i - int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); - int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); - int security_locked_down(enum lockdown_reason what); -+int lock_kernel_down(const char *where, enum lockdown_reason level); - int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, - void *val, size_t val_len, u64 id, u64 flags); - int security_bdev_alloc(struct block_device *bdev); -@@ -1504,6 +1505,11 @@ static inline int security_locked_down(e - { - return 0; - } -+static inline int -+lock_kernel_down(const char *where, enum lockdown_reason level) -+{ -+ return -EOPNOTSUPP; -+} - static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, - u32 *uctx_len, void *val, size_t val_len, - u64 id, u64 flags) ---- a/security/lockdown/Kconfig -+++ b/security/lockdown/Kconfig -@@ -45,3 +45,18 @@ config LOCK_DOWN_KERNEL_FORCE_CONFIDENTI - disabled. - - endchoice -+ -+config LOCK_DOWN_IN_EFI_SECURE_BOOT -+ bool "Lock down the kernel in EFI Secure Boot mode" -+ default n -+ depends on SECURITY_LOCKDOWN_LSM -+ depends on EFI -+ select SECURITY_LOCKDOWN_LSM_EARLY -+ help -+ UEFI Secure Boot provides a mechanism for ensuring that the firmware -+ will only load signed bootloaders and kernels. Secure boot mode may -+ be determined from EFI variables provided by the system firmware if -+ not indicated by the boot parameters. -+ -+ Enabling this option results in kernel lockdown being -+ triggered in integrity mode if EFI Secure Boot is set. ---- a/security/lockdown/lockdown.c -+++ b/security/lockdown/lockdown.c -@@ -24,7 +24,7 @@ static const enum lockdown_reason lockdo - /* - * Put the kernel into lock-down mode. - */ --static int lock_kernel_down(const char *where, enum lockdown_reason level) -+int lock_kernel_down(const char *where, enum lockdown_reason level) - { - if (kernel_locked_down >= level) - return -EPERM; diff --git a/debian/patches/features/all/lockdown/mtd-disable-slram-and-phram-when-locked-down.patch b/debian/patches/features/all/lockdown/mtd-disable-slram-and-phram-when-locked-down.patch deleted file mode 100644 index c718e7e..0000000 --- a/debian/patches/features/all/lockdown/mtd-disable-slram-and-phram-when-locked-down.patch +++ /dev/null @@ -1,75 +0,0 @@ -From: Ben Hutchings <ben@decadent.org.uk> -Date: Fri, 30 Aug 2019 15:54:24 +0100 -Subject: mtd: phram,slram: Disable when the kernel is locked down -Forwarded: https://lore.kernel.org/linux-security-module/20190830154720.eekfjt6c4jzvlbfz@decadent.org.uk/ - -These drivers allow mapping arbitrary memory ranges as MTD devices. -This should be disabled to preserve the kernel's integrity when it is -locked down. 
- -* Add the HWPARAM flag to the module parameters -* When slram is built-in, it uses __setup() to read kernel parameters, - so add an explicit check security_locked_down() check - -Signed-off-by: Ben Hutchings <ben@decadent.org.uk> -Cc: Matthew Garrett <mjg59@google.com> -Cc: David Howells <dhowells@redhat.com> -Cc: Joern Engel <joern@lazybastard.org> -Cc: linux-mtd@lists.infradead.org ---- - drivers/mtd/devices/phram.c | 6 +++++- - drivers/mtd/devices/slram.c | 9 ++++++++- - 2 files changed, 13 insertions(+), 2 deletions(-) - ---- a/drivers/mtd/devices/phram.c -+++ b/drivers/mtd/devices/phram.c -@@ -364,7 +364,11 @@ static int phram_param_call(const char * - #endif - } - --module_param_call(phram, phram_param_call, NULL, NULL, 0200); -+static const struct kernel_param_ops phram_param_ops = { -+ .set = phram_param_call -+}; -+__module_param_call(MODULE_PARAM_PREFIX, phram, &phram_param_ops, NULL, -+ 0200, -1, KERNEL_PARAM_FL_HWPARAM | hwparam_iomem); - MODULE_PARM_DESC(phram, "Memory region to map. \"phram=<name>,<start>,<length>[,<erasesize>]\""); - - #ifdef CONFIG_OF ---- a/drivers/mtd/devices/slram.c -+++ b/drivers/mtd/devices/slram.c -@@ -43,6 +43,7 @@ - #include <linux/ioctl.h> - #include <linux/init.h> - #include <linux/io.h> -+#include <linux/security.h> - - #include <linux/mtd/mtd.h> - -@@ -65,7 +66,7 @@ typedef struct slram_mtd_list { - #ifdef MODULE - static char *map[SLRAM_MAX_DEVICES_PARAMS]; - --module_param_array(map, charp, NULL, 0); -+module_param_hw_array(map, charp, iomem, NULL, 0); - MODULE_PARM_DESC(map, "List of memory regions to map. \"map=<name>, <start>, <length / end>\""); - #else - static char *map; -@@ -281,11 +282,17 @@ static int __init init_slram(void) - #ifndef MODULE - char *devstart; - char *devlength; -+ int ret; - - if (!map) { - E("slram: not enough parameters.\n"); - return(-EINVAL); - } -+ -+ ret = security_locked_down(LOCKDOWN_MODULE_PARAMETERS); -+ if (ret) -+ return ret; -+ - while (map) { - devname = devstart = devlength = NULL; - diff --git a/debian/patches/features/all/security-perf-allow-further-restriction-of-perf_event_open.patch b/debian/patches/features/all/security-perf-allow-further-restriction-of-perf_event_open.patch index 5aeae30..cf91c61 100644 --- a/debian/patches/features/all/security-perf-allow-further-restriction-of-perf_event_open.patch +++ b/debian/patches/features/all/security-perf-allow-further-restriction-of-perf_event_open.patch @@ -22,7 +22,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h -@@ -1617,6 +1617,11 @@ int perf_cpu_time_max_percent_handler(co +@@ -1659,6 +1659,11 @@ int perf_cpu_time_max_percent_handler(co int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -50,7 +50,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ -@@ -12681,6 +12686,9 @@ SYSCALL_DEFINE5(perf_event_open, +@@ -12821,6 +12826,9 @@ SYSCALL_DEFINE5(perf_event_open, if (err) return err; diff --git a/debian/patches/features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch b/debian/patches/features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch index fedbab5..ea825ed 100644 --- a/debian/patches/features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch +++ 
b/debian/patches/features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch @@ -15,7 +15,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig -@@ -57,13 +57,24 @@ config INTEL_IOMMU_SVM +@@ -56,13 +56,24 @@ config INTEL_IOMMU_SVM to access DMA resources through process address space by means of a Process Address Space ID (PASID). @@ -48,7 +48,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> def_bool y --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c -@@ -218,13 +218,13 @@ static LIST_HEAD(dmar_satc_units); +@@ -204,13 +204,13 @@ static LIST_HEAD(dmar_satc_units); static void intel_iommu_domain_free(struct iommu_domain *domain); @@ -64,7 +64,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; -@@ -263,6 +263,7 @@ static int __init intel_iommu_setup(char +@@ -249,6 +249,7 @@ static int __init intel_iommu_setup(char while (*str) { if (!strncmp(str, "on", 2)) { dmar_disabled = 0; diff --git a/debian/patches/features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch b/debian/patches/features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch index 2b45fc7..0f8e646 100644 --- a/debian/patches/features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch +++ b/debian/patches/features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch @@ -22,7 +22,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2218,6 +2218,8 @@ +@@ -2247,6 +2247,8 @@ bypassed by not enabling DMAR with this option. In this case, gfx device will use physical address for DMA. 
@@ -33,7 +33,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> sp_off [Default Off] --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c -@@ -36,6 +36,9 @@ +@@ -35,6 +35,9 @@ #define CONTEXT_SIZE VTD_PAGE_SIZE #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) @@ -43,7 +43,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) -@@ -208,12 +211,14 @@ int intel_iommu_sm = IS_ENABLED(CONFIG_I +@@ -207,12 +210,14 @@ int intel_iommu_sm = IS_ENABLED(CONFIG_I int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); @@ -58,7 +58,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> const struct iommu_ops intel_iommu_ops; static const struct iommu_dirty_ops intel_dirty_ops; -@@ -253,6 +258,9 @@ static int __init intel_iommu_setup(char +@@ -252,6 +257,9 @@ static int __init intel_iommu_setup(char } else if (!strncmp(str, "igfx_off", 8)) { disable_igfx_iommu = 1; pr_info("Disable GFX device mapping\n"); @@ -68,7 +68,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> } else if (!strncmp(str, "forcedac", 8)) { pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; -@@ -2034,6 +2042,9 @@ static int device_def_domain_type(struct +@@ -1902,6 +1910,9 @@ static int device_def_domain_type(struct if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev)) return IOMMU_DOMAIN_IDENTITY; @@ -78,7 +78,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> } return 0; -@@ -2332,6 +2343,9 @@ static int __init init_dmars(void) +@@ -2196,6 +2207,9 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); } diff --git a/debian/patches/features/x86/x86-make-x32-syscall-support-conditional.patch b/debian/patches/features/x86/x86-make-x32-syscall-support-conditional.patch index b4dd9b9..4e20d8c 100644 --- a/debian/patches/features/x86/x86-make-x32-syscall-support-conditional.patch +++ b/debian/patches/features/x86/x86-make-x32-syscall-support-conditional.patch @@ -29,7 +29,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -6498,6 +6498,10 @@ +@@ -6982,6 +6982,10 @@ later by a loaded module cannot be set this way. 
Example: sysctl.vm.swappiness=40 @@ -42,7 +42,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> Ignore sysrq setting - this boot parameter will --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig -@@ -3058,6 +3058,14 @@ config COMPAT_32 +@@ -3186,6 +3186,14 @@ config COMPAT_32 select HAVE_UID16 select OLD_SIGSUSPEND3 @@ -80,7 +80,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> #include <linux/syscalls.h> #include <asm/syscall.h> -@@ -20,3 +23,46 @@ +@@ -23,3 +26,46 @@ long x32_sys_call(const struct pt_regs * default: return __x64_sys_ni_syscall(regs); } }; @@ -159,7 +159,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -@@ -28,6 +29,18 @@ extern const sys_call_ptr_t ia32_sys_cal +@@ -28,6 +29,18 @@ extern long ia32_sys_call(const struct p extern long x32_sys_call(const struct pt_regs *, unsigned int nr); extern long x64_sys_call(const struct pt_regs *, unsigned int nr); diff --git a/debian/patches/features/x86/x86-memtest-WARN-if-bad-RAM-found.patch b/debian/patches/features/x86/x86-memtest-WARN-if-bad-RAM-found.patch index e8bc1b0..79dc51b 100644 --- a/debian/patches/features/x86/x86-memtest-WARN-if-bad-RAM-found.patch +++ b/debian/patches/features/x86/x86-memtest-WARN-if-bad-RAM-found.patch @@ -15,7 +15,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk> --- a/mm/memtest.c +++ b/mm/memtest.c -@@ -26,6 +26,10 @@ static u64 patterns[] __initdata = { +@@ -31,6 +31,10 @@ static u64 patterns[] __initdata = { static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) { diff --git a/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch index e8e28cd..1381e58 100644 --- a/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch +++ b/debian/patches/krd/0001-Revert-objtool-dont-fail-the-kernel-build-on-fatal-errors.patch @@ -30,7 +30,7 @@ this reverts following commit: --- a/tools/objtool/check.c +++ b/tools/objtool/check.c -@@ -4897,10 +4897,14 @@ int check(struct objtool_file *file) +@@ -4771,10 +4771,14 @@ int check(struct objtool_file *file) } out: diff --git a/debian/patches/krd/0003-local-ports.patch b/debian/patches/krd/0003-local-ports.patch index e471cc7..7cee0be 100644 --- a/debian/patches/krd/0003-local-ports.patch +++ b/debian/patches/krd/0003-local-ports.patch @@ -1,6 +1,6 @@ --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c -@@ -1800,7 +1800,7 @@ static __net_init int inet_init_net(stru +@@ -1790,7 +1790,7 @@ static __net_init int inet_init_net(stru /* * Set defaults for local port range */ diff --git a/debian/patches/krd/0004-bridge-group_fwd_mask.patch b/debian/patches/krd/0004-bridge-group_fwd_mask.patch index 61ac436..6b4f1a6 100644 --- a/debian/patches/krd/0004-bridge-group_fwd_mask.patch +++ b/debian/patches/krd/0004-bridge-group_fwd_mask.patch @@ -1,9 +1,9 @@ --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c -@@ -374,7 +374,11 @@ static rx_handler_result_t br_handle_fra - return RX_HANDLER_PASS; +@@ -383,7 +383,11 @@ static rx_handler_result_t br_handle_fra case 0x01: /* IEEE MAC (Pause) */ + reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; - goto drop; + fwd_mask |= p->br->group_fwd_mask; + if (fwd_mask & (1u << dest[5])) diff --git a/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch 
b/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch deleted file mode 100644 index 3895c18..0000000 --- a/debian/patches/misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch +++ /dev/null @@ -1,42 +0,0 @@ -From d53b209e5fcee3b3c53c30a4cc4fcc8e392a8fce Mon Sep 17 00:00:00 2001 -From: Alexandre Frade <kernel@xanmod.org> -Date: Mon, 11 Mar 2024 12:01:13 -0300 -Subject: [PATCH 19/19] x86/cfi,bpf: Add tso_segs and skb_marked_lost to - bpf_struct_ops CFI - -Rebased-by: Oleksandr Natalenko <oleksandr@natalenko.name> -[ https://github.com/sirlucjan/kernel-patches/blob/master/6.8/bbr3-patches/0001-tcp-bbr3-initial-import.patch ] -Signed-off-by: Alexandre Frade <kernel@xanmod.org> ---- - net/ipv4/bpf_tcp_ca.c | 9 +++++++-- - 1 file changed, 7 insertions(+), 2 deletions(-) - ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct - { - } - --static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) -+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) - { - return 0; - } - -+static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) -+{ -+} -+ - static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, - const struct rate_sample *rs) - { -@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_o - .cwnd_event = bpf_tcp_ca_cwnd_event, - .in_ack_event = bpf_tcp_ca_in_ack_event, - .pkts_acked = bpf_tcp_ca_pkts_acked, -- .min_tso_segs = bpf_tcp_ca_min_tso_segs, -+ .tso_segs = bpf_tcp_ca_tso_segs, -+ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, - .cong_control = bpf_tcp_ca_cong_control, - .undo_cwnd = bpf_tcp_ca_undo_cwnd, - .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/debian/patches/misc-ntsync7/0001-ntsync-Return-the-fd-from-NTSYNC_IOC_CREATE_SEM.patch b/debian/patches/misc-ntsync7/0001-ntsync-Return-the-fd-from-NTSYNC_IOC_CREATE_SEM.patch deleted file mode 100644 index c341f2b..0000000 --- a/debian/patches/misc-ntsync7/0001-ntsync-Return-the-fd-from-NTSYNC_IOC_CREATE_SEM.patch +++ /dev/null @@ -1,55 +0,0 @@ -From e50ffd43b88d64b8063a9fce59f1d03b56f6144c Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:42 -0600 -Subject: ntsync: Return the fd from NTSYNC_IOC_CREATE_SEM. - -Simplify the user API a bit by returning the fd as return value from the ioctl -instead of through the argument pointer. 
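A hedged user-space sketch of the calling convention after this change; the function name and error handling are illustrative, while NTSYNC_IOC_CREATE_SEM and struct ntsync_sem_args come from the patch itself.

```c
#include <sys/ioctl.h>
#include <linux/ntsync.h>

/* After this change the semaphore fd is the ioctl return value;
 * the args struct only carries count and max in. */
int create_sem_example(int dev_fd, unsigned int count, unsigned int max)
{
	struct ntsync_sem_args args = { .count = count, .max = max };

	return ioctl(dev_fd, NTSYNC_IOC_CREATE_SEM, &args); /* fd, or -1 */
}
```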
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 7 ++----- - include/uapi/linux/ntsync.h | 3 +-- - 2 files changed, 3 insertions(+), 7 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -165,7 +165,6 @@ static int ntsync_obj_get_fd(struct ntsy - - static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) - { -- struct ntsync_sem_args __user *user_args = argp; - struct ntsync_sem_args args; - struct ntsync_obj *sem; - int fd; -@@ -182,12 +181,10 @@ static int ntsync_create_sem(struct ntsy - sem->u.sem.count = args.count; - sem->u.sem.max = args.max; - fd = ntsync_obj_get_fd(sem); -- if (fd < 0) { -+ if (fd < 0) - kfree(sem); -- return fd; -- } - -- return put_user(fd, &user_args->sem); -+ return fd; - } - - static int ntsync_char_open(struct inode *inode, struct file *file) ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -11,12 +11,11 @@ - #include <linux/types.h> - - struct ntsync_sem_args { -- __u32 sem; - __u32 count; - __u32 max; - }; - --#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) -+#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) - - #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) - diff --git a/debian/patches/misc-ntsync7/0002-ntsync-Rename-NTSYNC_IOC_SEM_POST-to-NTSYNC_IOC_SEM_.patch b/debian/patches/misc-ntsync7/0002-ntsync-Rename-NTSYNC_IOC_SEM_POST-to-NTSYNC_IOC_SEM_.patch deleted file mode 100644 index 53ac42e..0000000 --- a/debian/patches/misc-ntsync7/0002-ntsync-Rename-NTSYNC_IOC_SEM_POST-to-NTSYNC_IOC_SEM_.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 160e9bf7826da868ae4de261753a03cce2208ff6 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:43 -0600 -Subject: ntsync: Rename NTSYNC_IOC_SEM_POST to NTSYNC_IOC_SEM_RELEASE. - -Use the more common "release" terminology, which is also the term used by NT, -instead of "post" (which is used by POSIX). - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 10 +++++----- - include/uapi/linux/ntsync.h | 2 +- - 2 files changed, 6 insertions(+), 6 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -57,7 +57,7 @@ struct ntsync_device { - * Actually change the semaphore state, returning -EOVERFLOW if it is made - * invalid. 
- */ --static int post_sem_state(struct ntsync_obj *sem, __u32 count) -+static int release_sem_state(struct ntsync_obj *sem, __u32 count) - { - __u32 sum; - -@@ -71,7 +71,7 @@ static int post_sem_state(struct ntsync_ - return 0; - } - --static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) -+static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) - { - __u32 __user *user_args = argp; - __u32 prev_count; -@@ -87,7 +87,7 @@ static int ntsync_sem_post(struct ntsync - spin_lock(&sem->lock); - - prev_count = sem->u.sem.count; -- ret = post_sem_state(sem, args); -+ ret = release_sem_state(sem, args); - - spin_unlock(&sem->lock); - -@@ -114,8 +114,8 @@ static long ntsync_obj_ioctl(struct file - void __user *argp = (void __user *)parm; - - switch (cmd) { -- case NTSYNC_IOC_SEM_POST: -- return ntsync_sem_post(obj, argp); -+ case NTSYNC_IOC_SEM_RELEASE: -+ return ntsync_sem_release(obj, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -17,6 +17,6 @@ struct ntsync_sem_args { - - #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) - --#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) -+#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - - #endif diff --git a/debian/patches/misc-ntsync7/0003-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch b/debian/patches/misc-ntsync7/0003-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch deleted file mode 100644 index 2507b2c..0000000 --- a/debian/patches/misc-ntsync7/0003-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch +++ /dev/null @@ -1,377 +0,0 @@ -From e855a17ec837cdee9047e6e23e47ed7b4312a265 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:44 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ANY. - -This corresponds to part of the functionality of the NT syscall -NtWaitForMultipleObjects(). Specifically, it implements the behaviour where -the third argument (wait_any) is TRUE, and it does not handle alertable waits. -Those features have been split out into separate patches to ease review. - -This patch therefore implements the wait/wake infrastructure which comprises the -core of ntsync's functionality. - -NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike -poll(), it "consumes" objects when they are signaled. For semaphores, this means -decreasing one from the internal counter. At most one object can be consumed by -this function. - -This wait/wake model is fundamentally different from that used anywhere else in -the kernel, and for that reason ntsync does not use any existing infrastructure, -such as futexes, kernel mutexes or semaphores, or wait_event(). - -Up to 64 objects can be waited on at once. As soon as one is signaled, the -object with the lowest index is consumed, and that index is returned via the -"index" field. - -A timeout is supported. The timeout is passed as a u64 nanosecond value, which -represents absolute time measured against either the MONOTONIC or REALTIME clock -(controlled by the flags argument). If U64_MAX is passed, the ioctl waits -indefinitely. - -This ioctl validates that all objects belong to the relevant device. This is not -necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be -necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch. - -Some padding fields are added for alignment and for fields which will be added -in future patches (split out to ease review). 
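A sketch of how user space would drive NTSYNC_IOC_WAIT_ANY as defined at this point in the series; dev_fd and the object fds are assumed to come from earlier ioctls, and the timeout/flag semantics are as described in the patch text above.

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/ntsync.h>

/* Wait indefinitely on two objects; on success, wait.index names
 * the object (lowest index wins) that was consumed. */
int wait_any_example(int dev_fd, int objs[2])
{
	struct ntsync_wait_args wait = {
		.timeout = UINT64_MAX,		/* U64_MAX == no timeout */
		.objs	 = (uintptr_t)objs,	/* array of object fds */
		.count	 = 2,
		.flags	 = 0,			/* CLOCK_MONOTONIC base */
	};

	if (ioctl(dev_fd, NTSYNC_IOC_WAIT_ANY, &wait) < 0)
		return -1;	/* e.g. errno == ETIMEDOUT */
	return (int)wait.index;
}
```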
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 247 +++++++++++++++++++++++++++++++++++- - include/uapi/linux/ntsync.h | 14 ++ - 2 files changed, 260 insertions(+), 1 deletion(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -6,11 +6,16 @@ - */ - - #include <linux/anon_inodes.h> -+#include <linux/atomic.h> - #include <linux/file.h> - #include <linux/fs.h> -+#include <linux/hrtimer.h> -+#include <linux/ktime.h> - #include <linux/miscdevice.h> - #include <linux/module.h> - #include <linux/overflow.h> -+#include <linux/sched.h> -+#include <linux/sched/signal.h> - #include <linux/slab.h> - #include <linux/spinlock.h> - #include <uapi/linux/ntsync.h> -@@ -30,6 +35,8 @@ enum ntsync_type { - * - * Both rely on struct file for reference counting. Individual - * ntsync_obj objects take a reference to the device when created. -+ * Wait operations take a reference to each object being waited on for -+ * the duration of the wait. - */ - - struct ntsync_obj { -@@ -47,12 +54,55 @@ struct ntsync_obj { - __u32 max; - } sem; - } u; -+ -+ struct list_head any_waiters; -+}; -+ -+struct ntsync_q_entry { -+ struct list_head node; -+ struct ntsync_q *q; -+ struct ntsync_obj *obj; -+ __u32 index; -+}; -+ -+struct ntsync_q { -+ struct task_struct *task; -+ -+ /* -+ * Protected via atomic_try_cmpxchg(). Only the thread that wins the -+ * compare-and-swap may actually change object states and wake this -+ * task. -+ */ -+ atomic_t signaled; -+ -+ __u32 count; -+ struct ntsync_q_entry entries[]; - }; - - struct ntsync_device { - struct file *file; - }; - -+static void try_wake_any_sem(struct ntsync_obj *sem) -+{ -+ struct ntsync_q_entry *entry; -+ -+ lockdep_assert_held(&sem->lock); -+ -+ list_for_each_entry(entry, &sem->any_waiters, node) { -+ struct ntsync_q *q = entry->q; -+ int signaled = -1; -+ -+ if (!sem->u.sem.count) -+ break; -+ -+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { -+ sem->u.sem.count--; -+ wake_up_process(q->task); -+ } -+ } -+} -+ - /* - * Actually change the semaphore state, returning -EOVERFLOW if it is made - * invalid. -@@ -87,7 +137,9 @@ static int ntsync_sem_release(struct nts - spin_lock(&sem->lock); - - prev_count = sem->u.sem.count; -- ret = release_sem_state(sem, args); -+ ret = post_sem_state(sem, args); -+ if (!ret) -+ try_wake_any_sem(sem); - - spin_unlock(&sem->lock); - -@@ -140,6 +192,7 @@ static struct ntsync_obj *ntsync_alloc_o - obj->dev = dev; - get_file(dev->file); - spin_lock_init(&obj->lock); -+ INIT_LIST_HEAD(&obj->any_waiters); - - return obj; - } -@@ -187,6 +240,196 @@ static int ntsync_create_sem(struct ntsy - return fd; - } - -+static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) -+{ -+ struct file *file = fget(fd); -+ struct ntsync_obj *obj; -+ -+ if (!file) -+ return NULL; -+ -+ if (file->f_op != &ntsync_obj_fops) { -+ fput(file); -+ return NULL; -+ } -+ -+ obj = file->private_data; -+ if (obj->dev != dev) { -+ fput(file); -+ return NULL; -+ } -+ -+ return obj; -+} -+ -+static void put_obj(struct ntsync_obj *obj) -+{ -+ fput(obj->file); -+} -+ -+static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) -+{ -+ ktime_t timeout = ns_to_ktime(args->timeout); -+ clockid_t clock = CLOCK_MONOTONIC; -+ ktime_t *timeout_ptr; -+ int ret = 0; -+ -+ timeout_ptr = (args->timeout == U64_MAX ? 
NULL : &timeout); -+ -+ if (args->flags & NTSYNC_WAIT_REALTIME) -+ clock = CLOCK_REALTIME; -+ -+ do { -+ if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (atomic_read(&q->signaled) != -1) { -+ ret = 0; -+ break; -+ } -+ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); -+ } while (ret < 0); -+ __set_current_state(TASK_RUNNING); -+ -+ return ret; -+} -+ -+/* -+ * Allocate and initialize the ntsync_q structure, but do not queue us yet. -+ */ -+static int setup_wait(struct ntsync_device *dev, -+ const struct ntsync_wait_args *args, -+ struct ntsync_q **ret_q) -+{ -+ const __u32 count = args->count; -+ int fds[NTSYNC_MAX_WAIT_COUNT]; -+ struct ntsync_q *q; -+ __u32 i, j; -+ -+ if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) -+ return -EINVAL; -+ -+ if (args->count > NTSYNC_MAX_WAIT_COUNT) -+ return -EINVAL; -+ -+ if (copy_from_user(fds, u64_to_user_ptr(args->objs), -+ array_size(count, sizeof(*fds)))) -+ return -EFAULT; -+ -+ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); -+ if (!q) -+ return -ENOMEM; -+ q->task = current; -+ atomic_set(&q->signaled, -1); -+ q->count = count; -+ -+ for (i = 0; i < count; i++) { -+ struct ntsync_q_entry *entry = &q->entries[i]; -+ struct ntsync_obj *obj = get_obj(dev, fds[i]); -+ -+ if (!obj) -+ goto err; -+ -+ entry->obj = obj; -+ entry->q = q; -+ entry->index = i; -+ } -+ -+ *ret_q = q; -+ return 0; -+ -+err: -+ for (j = 0; j < i; j++) -+ put_obj(q->entries[j].obj); -+ kfree(q); -+ return -EINVAL; -+} -+ -+static void try_wake_any_obj(struct ntsync_obj *obj) -+{ -+ switch (obj->type) { -+ case NTSYNC_TYPE_SEM: -+ try_wake_any_sem(obj); -+ break; -+ } -+} -+ -+static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) -+{ -+ struct ntsync_wait_args args; -+ struct ntsync_q *q; -+ int signaled; -+ __u32 i; -+ int ret; -+ -+ if (copy_from_user(&args, argp, sizeof(args))) -+ return -EFAULT; -+ -+ ret = setup_wait(dev, &args, &q); -+ if (ret < 0) -+ return ret; -+ -+ /* queue ourselves */ -+ -+ for (i = 0; i < args.count; i++) { -+ struct ntsync_q_entry *entry = &q->entries[i]; -+ struct ntsync_obj *obj = entry->obj; -+ -+ spin_lock(&obj->lock); -+ list_add_tail(&entry->node, &obj->any_waiters); -+ spin_unlock(&obj->lock); -+ } -+ -+ /* check if we are already signaled */ -+ -+ for (i = 0; i < args.count; i++) { -+ struct ntsync_obj *obj = q->entries[i].obj; -+ -+ if (atomic_read(&q->signaled) != -1) -+ break; -+ -+ spin_lock(&obj->lock); -+ try_wake_any_obj(obj); -+ spin_unlock(&obj->lock); -+ } -+ -+ /* sleep */ -+ -+ ret = ntsync_schedule(q, &args); -+ -+ /* and finally, unqueue */ -+ -+ for (i = 0; i < args.count; i++) { -+ struct ntsync_q_entry *entry = &q->entries[i]; -+ struct ntsync_obj *obj = entry->obj; -+ -+ spin_lock(&obj->lock); -+ list_del(&entry->node); -+ spin_unlock(&obj->lock); -+ -+ put_obj(obj); -+ } -+ -+ signaled = atomic_read(&q->signaled); -+ if (signaled != -1) { -+ struct ntsync_wait_args __user *user_args = argp; -+ -+ /* even if we caught a signal, we need to communicate success */ -+ ret = 0; -+ -+ if (put_user(signaled, &user_args->index)) -+ ret = -EFAULT; -+ } else if (!ret) { -+ ret = -ETIMEDOUT; -+ } -+ -+ kfree(q); -+ return ret; -+} -+ - static int ntsync_char_open(struct inode *inode, struct file *file) - { - struct ntsync_device *dev; -@@ -218,6 +461,8 @@ static long ntsync_char_ioctl(struct fil - switch (cmd) { - case NTSYNC_IOC_CREATE_SEM: - return 
ntsync_create_sem(dev, argp); -+ case NTSYNC_IOC_WAIT_ANY: -+ return ntsync_wait_any(dev, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -15,7 +15,21 @@ struct ntsync_sem_args { - __u32 max; - }; - -+#define NTSYNC_WAIT_REALTIME 0x1 -+ -+struct ntsync_wait_args { -+ __u64 timeout; -+ __u64 objs; -+ __u32 count; -+ __u32 index; -+ __u32 flags; -+ __u32 pad[3]; -+}; -+ -+#define NTSYNC_MAX_WAIT_COUNT 64 -+ - #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) -+#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - diff --git a/debian/patches/misc-ntsync7/0004-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch b/debian/patches/misc-ntsync7/0004-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch deleted file mode 100644 index 933498a..0000000 --- a/debian/patches/misc-ntsync7/0004-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch +++ /dev/null @@ -1,533 +0,0 @@ -From 6c1dac87ff835a48a067fe75bd0a6965921dac78 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:45 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ALL. -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are -simultaneously signaled, and then acquires all of them as a single atomic -operation. - -Because acquisition of multiple objects is atomic, some complex locking is -required. We cannot simply spin-lock multiple objects simultaneously, as that -may disable preëmption for a problematically long time. - -Instead, modifying any object which may be involved in a wait-all operation takes -a device-wide sleeping mutex, "wait_all_lock", instead of the normal object -spinlock. - -Because wait-for-all is a rare operation, in order to optimize wait-for-any, -this lock is only taken when necessary. "all_hint" is used to mark objects which -are involved in a wait-for-all operation, and if an object is not, only its -spinlock is taken. - -The locking scheme used here was written by Peter Zijlstra. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 336 ++++++++++++++++++++++++++++++++++-- - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 323 insertions(+), 14 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -13,6 +13,7 @@ - #include <linux/ktime.h> - #include <linux/miscdevice.h> - #include <linux/module.h> -+#include <linux/mutex.h> - #include <linux/overflow.h> - #include <linux/sched.h> - #include <linux/sched/signal.h> -@@ -41,6 +42,7 @@ enum ntsync_type { - - struct ntsync_obj { - spinlock_t lock; -+ int dev_locked; - - enum ntsync_type type; - -@@ -55,7 +57,30 @@ struct ntsync_obj { - } sem; - } u; - -+ /* -+ * any_waiters is protected by the object lock, but all_waiters is -+ * protected by the device wait_all_lock. -+ */ - struct list_head any_waiters; -+ struct list_head all_waiters; -+ -+ /* -+ * Hint describing how many tasks are queued on this object in a -+ * wait-all operation. -+ * -+ * Any time we do a wake, we may need to wake "all" waiters as well as -+ * "any" waiters. In order to atomically wake "all" waiters, we must -+ * lock all of the objects, and that means grabbing the wait_all_lock -+ * below (and, due to lock ordering rules, before locking this object). 
-+ * However, wait-all is a rare operation, and grabbing the wait-all -+ * lock for every wake would create unnecessary contention. -+ * Therefore we first check whether all_hint is zero, and, if it is, -+ * we skip trying to wake "all" waiters. -+ * -+ * Since wait requests must originate from user-space threads, we're -+ * limited here by PID_MAX_LIMIT, so there's no risk of overflow. -+ */ -+ atomic_t all_hint; - }; - - struct ntsync_q_entry { -@@ -75,19 +100,198 @@ struct ntsync_q { - */ - atomic_t signaled; - -+ bool all; - __u32 count; - struct ntsync_q_entry entries[]; - }; - - struct ntsync_device { -+ /* -+ * Wait-all operations must atomically grab all objects, and be totally -+ * ordered with respect to each other and wait-any operations. -+ * If one thread is trying to acquire several objects, another thread -+ * cannot touch the object at the same time. -+ * -+ * This device-wide lock is used to serialize wait-for-all -+ * operations, and operations on an object that is involved in a -+ * wait-for-all. -+ */ -+ struct mutex wait_all_lock; -+ - struct file *file; - }; - -+/* -+ * Single objects are locked using obj->lock. -+ * -+ * Multiple objects are 'locked' while holding dev->wait_all_lock. -+ * In this case however, individual objects are not locked by holding -+ * obj->lock, but by setting obj->dev_locked. -+ * -+ * This means that in order to lock a single object, the sequence is slightly -+ * more complicated than usual. Specifically it needs to check obj->dev_locked -+ * after acquiring obj->lock, if set, it needs to drop the lock and acquire -+ * dev->wait_all_lock in order to serialize against the multi-object operation. -+ */ -+ -+static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) -+{ -+ lockdep_assert_held(&dev->wait_all_lock); -+ lockdep_assert(obj->dev == dev); -+ spin_lock(&obj->lock); -+ /* -+ * By setting obj->dev_locked inside obj->lock, it is ensured that -+ * anyone holding obj->lock must see the value. -+ */ -+ obj->dev_locked = 1; -+ spin_unlock(&obj->lock); -+} -+ -+static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) -+{ -+ lockdep_assert_held(&dev->wait_all_lock); -+ lockdep_assert(obj->dev == dev); -+ spin_lock(&obj->lock); -+ obj->dev_locked = 0; -+ spin_unlock(&obj->lock); -+} -+ -+static void obj_lock(struct ntsync_obj *obj) -+{ -+ struct ntsync_device *dev = obj->dev; -+ -+ for (;;) { -+ spin_lock(&obj->lock); -+ if (likely(!obj->dev_locked)) -+ break; -+ -+ spin_unlock(&obj->lock); -+ mutex_lock(&dev->wait_all_lock); -+ spin_lock(&obj->lock); -+ /* -+ * obj->dev_locked should be set and released under the same -+ * wait_all_lock section, since we now own this lock, it should -+ * be clear. 
-+ */ -+ lockdep_assert(!obj->dev_locked); -+ spin_unlock(&obj->lock); -+ mutex_unlock(&dev->wait_all_lock); -+ } -+} -+ -+static void obj_unlock(struct ntsync_obj *obj) -+{ -+ spin_unlock(&obj->lock); -+} -+ -+static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) -+{ -+ bool all; -+ -+ obj_lock(obj); -+ all = atomic_read(&obj->all_hint); -+ if (unlikely(all)) { -+ obj_unlock(obj); -+ mutex_lock(&dev->wait_all_lock); -+ dev_lock_obj(dev, obj); -+ } -+ -+ return all; -+} -+ -+static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) -+{ -+ if (all) { -+ dev_unlock_obj(dev, obj); -+ mutex_unlock(&dev->wait_all_lock); -+ } else { -+ obj_unlock(obj); -+ } -+} -+ -+#define ntsync_assert_held(obj) \ -+ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ -+ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ -+ (obj)->dev_locked)) -+ -+static bool is_signaled(struct ntsync_obj *obj) -+{ -+ ntsync_assert_held(obj); -+ -+ switch (obj->type) { -+ case NTSYNC_TYPE_SEM: -+ return !!obj->u.sem.count; -+ } -+ -+ WARN(1, "bad object type %#x\n", obj->type); -+ return false; -+} -+ -+/* -+ * "locked_obj" is an optional pointer to an object which is already locked and -+ * should not be locked again. This is necessary so that changing an object's -+ * state and waking it can be a single atomic operation. -+ */ -+static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, -+ struct ntsync_obj *locked_obj) -+{ -+ __u32 count = q->count; -+ bool can_wake = true; -+ int signaled = -1; -+ __u32 i; -+ -+ lockdep_assert_held(&dev->wait_all_lock); -+ if (locked_obj) -+ lockdep_assert(locked_obj->dev_locked); -+ -+ for (i = 0; i < count; i++) { -+ if (q->entries[i].obj != locked_obj) -+ dev_lock_obj(dev, q->entries[i].obj); -+ } -+ -+ for (i = 0; i < count; i++) { -+ if (!is_signaled(q->entries[i].obj)) { -+ can_wake = false; -+ break; -+ } -+ } -+ -+ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { -+ for (i = 0; i < count; i++) { -+ struct ntsync_obj *obj = q->entries[i].obj; -+ -+ switch (obj->type) { -+ case NTSYNC_TYPE_SEM: -+ obj->u.sem.count--; -+ break; -+ } -+ } -+ wake_up_process(q->task); -+ } -+ -+ for (i = 0; i < count; i++) { -+ if (q->entries[i].obj != locked_obj) -+ dev_unlock_obj(dev, q->entries[i].obj); -+ } -+} -+ -+static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) -+{ -+ struct ntsync_q_entry *entry; -+ -+ lockdep_assert_held(&dev->wait_all_lock); -+ lockdep_assert(obj->dev_locked); -+ -+ list_for_each_entry(entry, &obj->all_waiters, node) -+ try_wake_all(dev, entry->q, obj); -+} -+ - static void try_wake_any_sem(struct ntsync_obj *sem) - { - struct ntsync_q_entry *entry; - -- lockdep_assert_held(&sem->lock); -+ ntsync_assert_held(sem); -+ lockdep_assert(sem->type == NTSYNC_TYPE_SEM); - - list_for_each_entry(entry, &sem->any_waiters, node) { - struct ntsync_q *q = entry->q; -@@ -111,7 +315,7 @@ static int release_sem_state(struct ntsy - { - __u32 sum; - -- lockdep_assert_held(&sem->lock); -+ ntsync_assert_held(sem); - - if (check_add_overflow(sem->u.sem.count, count, &sum) || - sum > sem->u.sem.max) -@@ -123,9 +327,11 @@ static int release_sem_state(struct ntsy - - static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) - { -+ struct ntsync_device *dev = sem->dev; - __u32 __user *user_args = argp; - __u32 prev_count; - __u32 args; -+ bool all; - int ret; - - if (copy_from_user(&args, argp, sizeof(args))) -@@ -134,14 
+340,17 @@ static int ntsync_sem_release(struct nts - if (sem->type != NTSYNC_TYPE_SEM) - return -EINVAL; - -- spin_lock(&sem->lock); -+ all = ntsync_lock_obj(dev, sem); - - prev_count = sem->u.sem.count; -- ret = post_sem_state(sem, args); -- if (!ret) -+ ret = release_sem_state(sem, args); -+ if (!ret) { -+ if (all) -+ try_wake_all_obj(dev, sem); - try_wake_any_sem(sem); -+ } - -- spin_unlock(&sem->lock); -+ ntsync_unlock_obj(dev, sem, all); - - if (!ret && put_user(prev_count, user_args)) - ret = -EFAULT; -@@ -193,6 +402,8 @@ static struct ntsync_obj *ntsync_alloc_o - get_file(dev->file); - spin_lock_init(&obj->lock); - INIT_LIST_HEAD(&obj->any_waiters); -+ INIT_LIST_HEAD(&obj->all_waiters); -+ atomic_set(&obj->all_hint, 0); - - return obj; - } -@@ -301,7 +512,7 @@ static int ntsync_schedule(const struct - * Allocate and initialize the ntsync_q structure, but do not queue us yet. - */ - static int setup_wait(struct ntsync_device *dev, -- const struct ntsync_wait_args *args, -+ const struct ntsync_wait_args *args, bool all, - struct ntsync_q **ret_q) - { - const __u32 count = args->count; -@@ -324,6 +535,7 @@ static int setup_wait(struct ntsync_devi - return -ENOMEM; - q->task = current; - atomic_set(&q->signaled, -1); -+ q->all = all; - q->count = count; - - for (i = 0; i < count; i++) { -@@ -333,6 +545,16 @@ static int setup_wait(struct ntsync_devi - if (!obj) - goto err; - -+ if (all) { -+ /* Check that the objects are all distinct. */ -+ for (j = 0; j < i; j++) { -+ if (obj == q->entries[j].obj) { -+ put_obj(obj); -+ goto err; -+ } -+ } -+ } -+ - entry->obj = obj; - entry->q = q; - entry->index = i; -@@ -362,13 +584,14 @@ static int ntsync_wait_any(struct ntsync - struct ntsync_wait_args args; - struct ntsync_q *q; - int signaled; -+ bool all; - __u32 i; - int ret; - - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - -- ret = setup_wait(dev, &args, &q); -+ ret = setup_wait(dev, &args, false, &q); - if (ret < 0) - return ret; - -@@ -378,9 +601,9 @@ static int ntsync_wait_any(struct ntsync - struct ntsync_q_entry *entry = &q->entries[i]; - struct ntsync_obj *obj = entry->obj; - -- spin_lock(&obj->lock); -+ all = ntsync_lock_obj(dev, obj); - list_add_tail(&entry->node, &obj->any_waiters); -- spin_unlock(&obj->lock); -+ ntsync_unlock_obj(dev, obj, all); - } - - /* check if we are already signaled */ -@@ -391,9 +614,9 @@ static int ntsync_wait_any(struct ntsync - if (atomic_read(&q->signaled) != -1) - break; - -- spin_lock(&obj->lock); -+ all = ntsync_lock_obj(dev, obj); - try_wake_any_obj(obj); -- spin_unlock(&obj->lock); -+ ntsync_unlock_obj(dev, obj, all); - } - - /* sleep */ -@@ -406,13 +629,94 @@ static int ntsync_wait_any(struct ntsync - struct ntsync_q_entry *entry = &q->entries[i]; - struct ntsync_obj *obj = entry->obj; - -- spin_lock(&obj->lock); -+ all = ntsync_lock_obj(dev, obj); - list_del(&entry->node); -- spin_unlock(&obj->lock); -+ ntsync_unlock_obj(dev, obj, all); -+ -+ put_obj(obj); -+ } -+ -+ signaled = atomic_read(&q->signaled); -+ if (signaled != -1) { -+ struct ntsync_wait_args __user *user_args = argp; -+ -+ /* even if we caught a signal, we need to communicate success */ -+ ret = 0; -+ -+ if (put_user(signaled, &user_args->index)) -+ ret = -EFAULT; -+ } else if (!ret) { -+ ret = -ETIMEDOUT; -+ } -+ -+ kfree(q); -+ return ret; -+} -+ -+static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) -+{ -+ struct ntsync_wait_args args; -+ struct ntsync_q *q; -+ int signaled; -+ __u32 i; -+ int ret; -+ -+ if (copy_from_user(&args, argp, 
sizeof(args))) -+ return -EFAULT; -+ -+ ret = setup_wait(dev, &args, true, &q); -+ if (ret < 0) -+ return ret; -+ -+ /* queue ourselves */ -+ -+ mutex_lock(&dev->wait_all_lock); -+ -+ for (i = 0; i < args.count; i++) { -+ struct ntsync_q_entry *entry = &q->entries[i]; -+ struct ntsync_obj *obj = entry->obj; -+ -+ atomic_inc(&obj->all_hint); -+ -+ /* -+ * obj->all_waiters is protected by dev->wait_all_lock rather -+ * than obj->lock, so there is no need to acquire obj->lock -+ * here. -+ */ -+ list_add_tail(&entry->node, &obj->all_waiters); -+ } -+ -+ /* check if we are already signaled */ -+ -+ try_wake_all(dev, q, NULL); -+ -+ mutex_unlock(&dev->wait_all_lock); -+ -+ /* sleep */ -+ -+ ret = ntsync_schedule(q, &args); -+ -+ /* and finally, unqueue */ -+ -+ mutex_lock(&dev->wait_all_lock); -+ -+ for (i = 0; i < args.count; i++) { -+ struct ntsync_q_entry *entry = &q->entries[i]; -+ struct ntsync_obj *obj = entry->obj; -+ -+ /* -+ * obj->all_waiters is protected by dev->wait_all_lock rather -+ * than obj->lock, so there is no need to acquire it here. -+ */ -+ list_del(&entry->node); -+ -+ atomic_dec(&obj->all_hint); - - put_obj(obj); - } - -+ mutex_unlock(&dev->wait_all_lock); -+ - signaled = atomic_read(&q->signaled); - if (signaled != -1) { - struct ntsync_wait_args __user *user_args = argp; -@@ -438,6 +742,8 @@ static int ntsync_char_open(struct inode - if (!dev) - return -ENOMEM; - -+ mutex_init(&dev->wait_all_lock); -+ - file->private_data = dev; - dev->file = file; - return nonseekable_open(inode, file); -@@ -461,6 +767,8 @@ static long ntsync_char_ioctl(struct fil - switch (cmd) { - case NTSYNC_IOC_CREATE_SEM: - return ntsync_create_sem(dev, argp); -+ case NTSYNC_IOC_WAIT_ALL: -+ return ntsync_wait_all(dev, argp); - case NTSYNC_IOC_WAIT_ANY: - return ntsync_wait_any(dev, argp); - default: ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -30,6 +30,7 @@ struct ntsync_wait_args { - - #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) - #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) -+#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - diff --git a/debian/patches/misc-ntsync7/0005-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch b/debian/patches/misc-ntsync7/0005-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch deleted file mode 100644 index ddc6202..0000000 --- a/debian/patches/misc-ntsync7/0005-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch +++ /dev/null @@ -1,222 +0,0 @@ -From bcdeaefdc4b60e7845232c201427717df3a83277 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:46 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX. - -This corresponds to the NT syscall NtCreateMutant(). - -An NT mutex is recursive, with a 32-bit recursion counter. When acquired via -NtWaitForMultipleObjects(), the recursion counter is incremented by one. The OS -records the thread which acquired it. - -The OS records the thread which acquired it. However, in order to keep this -driver self-contained, the owning thread ID is managed by user-space, and passed -as a parameter to all relevant ioctls. - -The initial owner and recursion count, if any, are specified when the mutex is -created. 
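A sketch of creating an initially-owned mutex under the rules above. The owner value is whatever identifier user space uses for threads; using gettid() here is an assumption about the caller's convention, not something the patch mandates.

```c
#define _GNU_SOURCE
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ntsync.h>

/* owner and count must be both zero (unowned) or both nonzero:
 * the ioctl rejects a mismatched pair with -EINVAL. */
int create_owned_mutex_example(int dev_fd)
{
	struct ntsync_mutex_args args = {
		.owner = (__u32)gettid(),
		.count = 1,		/* one recursion level held */
	};

	return ioctl(dev_fd, NTSYNC_IOC_CREATE_MUTEX, &args);
}
```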
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 74 +++++++++++++++++++++++++++++++++++-- - include/uapi/linux/ntsync.h | 9 ++++- - 2 files changed, 79 insertions(+), 4 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -25,6 +25,7 @@ - - enum ntsync_type { - NTSYNC_TYPE_SEM, -+ NTSYNC_TYPE_MUTEX, - }; - - /* -@@ -55,6 +56,10 @@ struct ntsync_obj { - __u32 count; - __u32 max; - } sem; -+ struct { -+ __u32 count; -+ pid_t owner; -+ } mutex; - } u; - - /* -@@ -92,6 +97,7 @@ struct ntsync_q_entry { - - struct ntsync_q { - struct task_struct *task; -+ __u32 owner; - - /* - * Protected via atomic_try_cmpxchg(). Only the thread that wins the -@@ -214,13 +220,17 @@ static void ntsync_unlock_obj(struct nts - ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ - (obj)->dev_locked)) - --static bool is_signaled(struct ntsync_obj *obj) -+static bool is_signaled(struct ntsync_obj *obj, __u32 owner) - { - ntsync_assert_held(obj); - - switch (obj->type) { - case NTSYNC_TYPE_SEM: - return !!obj->u.sem.count; -+ case NTSYNC_TYPE_MUTEX: -+ if (obj->u.mutex.owner && obj->u.mutex.owner != owner) -+ return false; -+ return obj->u.mutex.count < UINT_MAX; - } - - WARN(1, "bad object type %#x\n", obj->type); -@@ -250,7 +260,7 @@ static void try_wake_all(struct ntsync_d - } - - for (i = 0; i < count; i++) { -- if (!is_signaled(q->entries[i].obj)) { -+ if (!is_signaled(q->entries[i].obj, q->owner)) { - can_wake = false; - break; - } -@@ -264,6 +274,10 @@ static void try_wake_all(struct ntsync_d - case NTSYNC_TYPE_SEM: - obj->u.sem.count--; - break; -+ case NTSYNC_TYPE_MUTEX: -+ obj->u.mutex.count++; -+ obj->u.mutex.owner = q->owner; -+ break; - } - } - wake_up_process(q->task); -@@ -307,6 +321,30 @@ static void try_wake_any_sem(struct ntsy - } - } - -+static void try_wake_any_mutex(struct ntsync_obj *mutex) -+{ -+ struct ntsync_q_entry *entry; -+ -+ ntsync_assert_held(mutex); -+ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); -+ -+ list_for_each_entry(entry, &mutex->any_waiters, node) { -+ struct ntsync_q *q = entry->q; -+ int signaled = -1; -+ -+ if (mutex->u.mutex.count == UINT_MAX) -+ break; -+ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) -+ continue; -+ -+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { -+ mutex->u.mutex.count++; -+ mutex->u.mutex.owner = q->owner; -+ wake_up_process(q->task); -+ } -+ } -+} -+ - /* - * Actually change the semaphore state, returning -EOVERFLOW if it is made - * invalid. 
-@@ -451,6 +489,30 @@ static int ntsync_create_sem(struct ntsy - return fd; - } - -+static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) -+{ -+ struct ntsync_mutex_args args; -+ struct ntsync_obj *mutex; -+ int fd; -+ -+ if (copy_from_user(&args, argp, sizeof(args))) -+ return -EFAULT; -+ -+ if (!args.owner != !args.count) -+ return -EINVAL; -+ -+ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); -+ if (!mutex) -+ return -ENOMEM; -+ mutex->u.mutex.count = args.count; -+ mutex->u.mutex.owner = args.owner; -+ fd = ntsync_obj_get_fd(mutex); -+ if (fd < 0) -+ kfree(mutex); -+ -+ return fd; -+} -+ - static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) - { - struct file *file = fget(fd); -@@ -520,7 +582,7 @@ static int setup_wait(struct ntsync_devi - struct ntsync_q *q; - __u32 i, j; - -- if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME)) -+ if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) - return -EINVAL; - - if (args->count > NTSYNC_MAX_WAIT_COUNT) -@@ -534,6 +596,7 @@ static int setup_wait(struct ntsync_devi - if (!q) - return -ENOMEM; - q->task = current; -+ q->owner = args->owner; - atomic_set(&q->signaled, -1); - q->all = all; - q->count = count; -@@ -576,6 +639,9 @@ static void try_wake_any_obj(struct ntsy - case NTSYNC_TYPE_SEM: - try_wake_any_sem(obj); - break; -+ case NTSYNC_TYPE_MUTEX: -+ try_wake_any_mutex(obj); -+ break; - } - } - -@@ -765,6 +831,8 @@ static long ntsync_char_ioctl(struct fil - void __user *argp = (void __user *)parm; - - switch (cmd) { -+ case NTSYNC_IOC_CREATE_MUTEX: -+ return ntsync_create_mutex(dev, argp); - case NTSYNC_IOC_CREATE_SEM: - return ntsync_create_sem(dev, argp); - case NTSYNC_IOC_WAIT_ALL: ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -15,6 +15,11 @@ struct ntsync_sem_args { - __u32 max; - }; - -+struct ntsync_mutex_args { -+ __u32 owner; -+ __u32 count; -+}; -+ - #define NTSYNC_WAIT_REALTIME 0x1 - - struct ntsync_wait_args { -@@ -23,7 +28,8 @@ struct ntsync_wait_args { - __u32 count; - __u32 index; - __u32 flags; -- __u32 pad[3]; -+ __u32 owner; -+ __u32 pad[2]; - }; - - #define NTSYNC_MAX_WAIT_COUNT 64 -@@ -31,6 +37,7 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) - #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) - #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) -+#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - diff --git a/debian/patches/misc-ntsync7/0006-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch b/debian/patches/misc-ntsync7/0006-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch deleted file mode 100644 index 5c10875..0000000 --- a/debian/patches/misc-ntsync7/0006-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch +++ /dev/null @@ -1,95 +0,0 @@ -From e349279c9dc7fc2136a764a16074a90ef3039f38 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:47 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK. - -This corresponds to the NT syscall NtReleaseMutant(). - -This syscall decrements the mutex's recursion count by one, and returns the -previous value. If the mutex is not owned by the current task, the function -instead fails and returns -EPERM. 
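A sketch of the release path from user space; note the ioctl is issued on the mutex object's own fd, and the pre-release recursion count comes back through args.count. The wrapper function is illustrative only.

```c
#include <sys/ioctl.h>
#include <linux/ntsync.h>

/* Returns the recursion count held before the release, or -1 with
 * errno set (EPERM if `owner` does not own the mutex). */
int mutex_unlock_example(int mutex_fd, __u32 owner)
{
	struct ntsync_mutex_args args = { .owner = owner };

	if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_UNLOCK, &args) < 0)
		return -1;
	return (int)args.count;
}
```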
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 53 +++++++++++++++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 54 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -396,6 +396,57 @@ static int ntsync_sem_release(struct nts - return ret; - } - -+/* -+ * Actually change the mutex state, returning -EPERM if not the owner. -+ */ -+static int unlock_mutex_state(struct ntsync_obj *mutex, -+ const struct ntsync_mutex_args *args) -+{ -+ ntsync_assert_held(mutex); -+ -+ if (mutex->u.mutex.owner != args->owner) -+ return -EPERM; -+ -+ if (!--mutex->u.mutex.count) -+ mutex->u.mutex.owner = 0; -+ return 0; -+} -+ -+static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) -+{ -+ struct ntsync_mutex_args __user *user_args = argp; -+ struct ntsync_device *dev = mutex->dev; -+ struct ntsync_mutex_args args; -+ __u32 prev_count; -+ bool all; -+ int ret; -+ -+ if (copy_from_user(&args, argp, sizeof(args))) -+ return -EFAULT; -+ if (!args.owner) -+ return -EINVAL; -+ -+ if (mutex->type != NTSYNC_TYPE_MUTEX) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, mutex); -+ -+ prev_count = mutex->u.mutex.count; -+ ret = unlock_mutex_state(mutex, &args); -+ if (!ret) { -+ if (all) -+ try_wake_all_obj(dev, mutex); -+ try_wake_any_mutex(mutex); -+ } -+ -+ ntsync_unlock_obj(dev, mutex, all); -+ -+ if (!ret && put_user(prev_count, &user_args->count)) -+ ret = -EFAULT; -+ -+ return ret; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -415,6 +466,8 @@ static long ntsync_obj_ioctl(struct file - switch (cmd) { - case NTSYNC_IOC_SEM_RELEASE: - return ntsync_sem_release(obj, argp); -+ case NTSYNC_IOC_MUTEX_UNLOCK: -+ return ntsync_mutex_unlock(obj, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -40,5 +40,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) -+#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) - - #endif diff --git a/debian/patches/misc-ntsync7/0007-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch b/debian/patches/misc-ntsync7/0007-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch deleted file mode 100644 index 9befec2..0000000 --- a/debian/patches/misc-ntsync7/0007-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch +++ /dev/null @@ -1,156 +0,0 @@ -From ebb60a10ac3c6b28ba7a46aa67b279d41ad9356d Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:48 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_KILL. - -This does not correspond to any NT syscall. Rather, when a thread dies, it -should be called by the NT emulator for each mutex, with the TID of the dying -thread. - -NT mutexes are robust (in the pthread sense). When an NT thread dies, any -mutexes it owned are immediately released. Acquisition of those mutexes by other -threads will return a special value indicating that the mutex was abandoned, -like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed -used here for that purpose. 
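[Editor's note: in practice the emulator issues this once per mutex when a thread exits. A hedged sketch, assuming mutex_fds holds the candidate mutex file descriptors and dead_tid is the nonzero identifier of the dead thread; EPERM simply means the thread did not own that particular mutex, so it can be ignored.]

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    static void reap_dead_owner(const int *mutex_fds, int n, __u32 dead_tid)
    {
            for (int i = 0; i < n; i++) {
                    __u32 owner = dead_tid;

                    if (ioctl(mutex_fds[i], NTSYNC_IOC_MUTEX_KILL, &owner) == 0)
                            continue; /* marked abandoned; waiters get EOWNERDEAD */
                    if (errno != EPERM)
                            perror("NTSYNC_IOC_MUTEX_KILL");
            }
    }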
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 61 +++++++++++++++++++++++++++++++++++-- - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 60 insertions(+), 2 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -59,6 +59,7 @@ struct ntsync_obj { - struct { - __u32 count; - pid_t owner; -+ bool ownerdead; - } mutex; - } u; - -@@ -107,6 +108,7 @@ struct ntsync_q { - atomic_t signaled; - - bool all; -+ bool ownerdead; - __u32 count; - struct ntsync_q_entry entries[]; - }; -@@ -275,6 +277,9 @@ static void try_wake_all(struct ntsync_d - obj->u.sem.count--; - break; - case NTSYNC_TYPE_MUTEX: -+ if (obj->u.mutex.ownerdead) -+ q->ownerdead = true; -+ obj->u.mutex.ownerdead = false; - obj->u.mutex.count++; - obj->u.mutex.owner = q->owner; - break; -@@ -338,6 +343,9 @@ static void try_wake_any_mutex(struct nt - continue; - - if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { -+ if (mutex->u.mutex.ownerdead) -+ q->ownerdead = true; -+ mutex->u.mutex.ownerdead = false; - mutex->u.mutex.count++; - mutex->u.mutex.owner = q->owner; - wake_up_process(q->task); -@@ -447,6 +455,52 @@ static int ntsync_mutex_unlock(struct nt - return ret; - } - -+/* -+ * Actually change the mutex state to mark its owner as dead, -+ * returning -EPERM if not the owner. -+ */ -+static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) -+{ -+ ntsync_assert_held(mutex); -+ -+ if (mutex->u.mutex.owner != owner) -+ return -EPERM; -+ -+ mutex->u.mutex.ownerdead = true; -+ mutex->u.mutex.owner = 0; -+ mutex->u.mutex.count = 0; -+ return 0; -+} -+ -+static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) -+{ -+ struct ntsync_device *dev = mutex->dev; -+ __u32 owner; -+ bool all; -+ int ret; -+ -+ if (get_user(owner, (__u32 __user *)argp)) -+ return -EFAULT; -+ if (!owner) -+ return -EINVAL; -+ -+ if (mutex->type != NTSYNC_TYPE_MUTEX) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, mutex); -+ -+ ret = kill_mutex_state(mutex, owner); -+ if (!ret) { -+ if (all) -+ try_wake_all_obj(dev, mutex); -+ try_wake_any_mutex(mutex); -+ } -+ -+ ntsync_unlock_obj(dev, mutex, all); -+ -+ return ret; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -468,6 +522,8 @@ static long ntsync_obj_ioctl(struct file - return ntsync_sem_release(obj, argp); - case NTSYNC_IOC_MUTEX_UNLOCK: - return ntsync_mutex_unlock(obj, argp); -+ case NTSYNC_IOC_MUTEX_KILL: -+ return ntsync_mutex_kill(obj, argp); - default: - return -ENOIOCTLCMD; - } -@@ -652,6 +708,7 @@ static int setup_wait(struct ntsync_devi - q->owner = args->owner; - atomic_set(&q->signaled, -1); - q->all = all; -+ q->ownerdead = false; - q->count = count; - - for (i = 0; i < count; i++) { -@@ -760,7 +817,7 @@ static int ntsync_wait_any(struct ntsync - struct ntsync_wait_args __user *user_args = argp; - - /* even if we caught a signal, we need to communicate success */ -- ret = 0; -+ ret = q->ownerdead ? -EOWNERDEAD : 0; - - if (put_user(signaled, &user_args->index)) - ret = -EFAULT; -@@ -841,7 +898,7 @@ static int ntsync_wait_all(struct ntsync - struct ntsync_wait_args __user *user_args = argp; - - /* even if we caught a signal, we need to communicate success */ -- ret = 0; -+ ret = q->ownerdead ? 
-EOWNERDEAD : 0; - - if (put_user(signaled, &user_args->index)) - ret = -EFAULT; ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -41,5 +41,6 @@ struct ntsync_wait_args { - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) -+#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) - - #endif diff --git a/debian/patches/misc-ntsync7/0008-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch b/debian/patches/misc-ntsync7/0008-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch deleted file mode 100644 index 0714942..0000000 --- a/debian/patches/misc-ntsync7/0008-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch +++ /dev/null @@ -1,162 +0,0 @@ -From f74c8259d49ea4c0e679902da9c7c95ec06ae65c Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:49 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_CREATE_EVENT. - -This correspond to the NT syscall NtCreateEvent(). - -An NT event holds a single bit of state denoting whether it is signaled or -unsignaled. - -There are two types of events: manual-reset and automatic-reset. When an -automatic-reset event is acquired via a wait function, its state is reset to -unsignaled. Manual-reset events are not affected by wait functions. - -Whether the event is manual-reset, and its initial state, are specified at -creation time. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 59 +++++++++++++++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 6 ++++ - 2 files changed, 65 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -26,6 +26,7 @@ - enum ntsync_type { - NTSYNC_TYPE_SEM, - NTSYNC_TYPE_MUTEX, -+ NTSYNC_TYPE_EVENT, - }; - - /* -@@ -61,6 +62,10 @@ struct ntsync_obj { - pid_t owner; - bool ownerdead; - } mutex; -+ struct { -+ bool manual; -+ bool signaled; -+ } event; - } u; - - /* -@@ -233,6 +238,8 @@ static bool is_signaled(struct ntsync_ob - if (obj->u.mutex.owner && obj->u.mutex.owner != owner) - return false; - return obj->u.mutex.count < UINT_MAX; -+ case NTSYNC_TYPE_EVENT: -+ return obj->u.event.signaled; - } - - WARN(1, "bad object type %#x\n", obj->type); -@@ -283,6 +290,10 @@ static void try_wake_all(struct ntsync_d - obj->u.mutex.count++; - obj->u.mutex.owner = q->owner; - break; -+ case NTSYNC_TYPE_EVENT: -+ if (!obj->u.event.manual) -+ obj->u.event.signaled = false; -+ break; - } - } - wake_up_process(q->task); -@@ -353,6 +364,28 @@ static void try_wake_any_mutex(struct nt - } - } - -+static void try_wake_any_event(struct ntsync_obj *event) -+{ -+ struct ntsync_q_entry *entry; -+ -+ ntsync_assert_held(event); -+ lockdep_assert(event->type == NTSYNC_TYPE_EVENT); -+ -+ list_for_each_entry(entry, &event->any_waiters, node) { -+ struct ntsync_q *q = entry->q; -+ int signaled = -1; -+ -+ if (!event->u.event.signaled) -+ break; -+ -+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { -+ if (!event->u.event.manual) -+ event->u.event.signaled = false; -+ wake_up_process(q->task); -+ } -+ } -+} -+ - /* - * Actually change the semaphore state, returning -EOVERFLOW if it is made - * invalid. 
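[Editor's note: for illustration, a user-space sketch of creating an auto-reset event with the ioctl this patch adds. dev_fd is assumed to be an open /dev/ntsync file descriptor; on success the ioctl returns a new file descriptor for the event.]

    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    static int create_auto_reset_event(int dev_fd)
    {
            struct ntsync_event_args args = {
                    .manual   = 0, /* auto-reset: designaled when a wait is satisfied */
                    .signaled = 0, /* start unsignaled */
            };

            return ioctl(dev_fd, NTSYNC_IOC_CREATE_EVENT, &args);
    }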
-@@ -622,6 +655,27 @@ static int ntsync_create_mutex(struct nt - return fd; - } - -+static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) -+{ -+ struct ntsync_event_args args; -+ struct ntsync_obj *event; -+ int fd; -+ -+ if (copy_from_user(&args, argp, sizeof(args))) -+ return -EFAULT; -+ -+ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); -+ if (!event) -+ return -ENOMEM; -+ event->u.event.manual = args.manual; -+ event->u.event.signaled = args.signaled; -+ fd = ntsync_obj_get_fd(event); -+ if (fd < 0) -+ kfree(event); -+ -+ return fd; -+} -+ - static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) - { - struct file *file = fget(fd); -@@ -752,6 +806,9 @@ static void try_wake_any_obj(struct ntsy - case NTSYNC_TYPE_MUTEX: - try_wake_any_mutex(obj); - break; -+ case NTSYNC_TYPE_EVENT: -+ try_wake_any_event(obj); -+ break; - } - } - -@@ -941,6 +998,8 @@ static long ntsync_char_ioctl(struct fil - void __user *argp = (void __user *)parm; - - switch (cmd) { -+ case NTSYNC_IOC_CREATE_EVENT: -+ return ntsync_create_event(dev, argp); - case NTSYNC_IOC_CREATE_MUTEX: - return ntsync_create_mutex(dev, argp); - case NTSYNC_IOC_CREATE_SEM: ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -20,6 +20,11 @@ struct ntsync_mutex_args { - __u32 count; - }; - -+struct ntsync_event_args { -+ __u32 manual; -+ __u32 signaled; -+}; -+ - #define NTSYNC_WAIT_REALTIME 0x1 - - struct ntsync_wait_args { -@@ -38,6 +43,7 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) - #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) - #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) -+#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) - - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) diff --git a/debian/patches/misc-ntsync7/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch b/debian/patches/misc-ntsync7/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch deleted file mode 100644 index abeca7f..0000000 --- a/debian/patches/misc-ntsync7/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch +++ /dev/null @@ -1,67 +0,0 @@ -From bf60db9cfeccc8f92636b6dcf2eccd7fcd8d84f3 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:50 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_EVENT_SET. - -This corresponds to the NT syscall NtSetEvent(). - -This sets the event to the signaled state, and returns its previous state. 
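[Editor's note: a minimal sketch of this call from user space, assuming event_fd refers to an ntsync event object; the 32-bit argument is output-only and receives the pre-set state.]

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    static int set_event(int event_fd)
    {
            __u32 prev;

            if (ioctl(event_fd, NTSYNC_IOC_EVENT_SET, &prev) < 0)
                    return -1;
            printf("event was %s signaled\n", prev ? "already" : "not yet");
            return 0;
    }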
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 27 +++++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 28 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -534,6 +534,31 @@ static int ntsync_mutex_kill(struct ntsy - return ret; - } - -+static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) -+{ -+ struct ntsync_device *dev = event->dev; -+ __u32 prev_state; -+ bool all; -+ -+ if (event->type != NTSYNC_TYPE_EVENT) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, event); -+ -+ prev_state = event->u.event.signaled; -+ event->u.event.signaled = true; -+ if (all) -+ try_wake_all_obj(dev, event); -+ try_wake_any_event(event); -+ -+ ntsync_unlock_obj(dev, event, all); -+ -+ if (put_user(prev_state, (__u32 __user *)argp)) -+ return -EFAULT; -+ -+ return 0; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -557,6 +582,8 @@ static long ntsync_obj_ioctl(struct file - return ntsync_mutex_unlock(obj, argp); - case NTSYNC_IOC_MUTEX_KILL: - return ntsync_mutex_kill(obj, argp); -+ case NTSYNC_IOC_EVENT_SET: -+ return ntsync_event_set(obj, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -48,5 +48,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) - #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) - #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) -+#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) - - #endif diff --git a/debian/patches/misc-ntsync7/0010-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch b/debian/patches/misc-ntsync7/0010-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch deleted file mode 100644 index 8a7a580..0000000 --- a/debian/patches/misc-ntsync7/0010-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch +++ /dev/null @@ -1,64 +0,0 @@ -From f2de3c99a840cac45446515dd268cb9d64f9f892 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:51 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_EVENT_RESET. - -This corresponds to the NT syscall NtResetEvent(). - -This sets the event to the unsignaled state, and returns its previous state. 
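[Editor's note: because the previous state is returned, reset can double as an atomic test-and-clear from user space. A sketch, assuming event_fd refers to an ntsync event object.]

    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Returns 1 if the event was signaled (and is now clear), 0 if it
     * was not, or -1 on error. */
    static int test_and_clear_event(int event_fd)
    {
            __u32 prev;

            if (ioctl(event_fd, NTSYNC_IOC_EVENT_RESET, &prev) < 0)
                    return -1;
            return prev ? 1 : 0;
    }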
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 25 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -559,6 +559,28 @@ static int ntsync_event_set(struct ntsyn - return 0; - } - -+static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) -+{ -+ struct ntsync_device *dev = event->dev; -+ __u32 prev_state; -+ bool all; -+ -+ if (event->type != NTSYNC_TYPE_EVENT) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, event); -+ -+ prev_state = event->u.event.signaled; -+ event->u.event.signaled = false; -+ -+ ntsync_unlock_obj(dev, event, all); -+ -+ if (put_user(prev_state, (__u32 __user *)argp)) -+ return -EFAULT; -+ -+ return 0; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -584,6 +606,8 @@ static long ntsync_obj_ioctl(struct file - return ntsync_mutex_kill(obj, argp); - case NTSYNC_IOC_EVENT_SET: - return ntsync_event_set(obj, argp); -+ case NTSYNC_IOC_EVENT_RESET: -+ return ntsync_event_reset(obj, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -49,5 +49,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) - #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) - #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) -+#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) - - #endif diff --git a/debian/patches/misc-ntsync7/0011-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch b/debian/patches/misc-ntsync7/0011-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch deleted file mode 100644 index c335c3e..0000000 --- a/debian/patches/misc-ntsync7/0011-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 50c791dde217f9fdc1785de77fa2ae888d6bdb4e Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:52 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_EVENT_PULSE. - -This corresponds to the NT syscall NtPulseEvent(). - -This wakes up any waiters as if the event had been set, but does not set the -event, instead resetting it if it had been signalled. Thus, for a manual-reset -event, all waiters are woken, whereas for an auto-reset event, at most one -waiter is woken. 
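[Editor's note: a sketch of the call, assuming event_fd refers to an ntsync event object. Whatever prev reports, the event is left unsignaled afterwards, so only waiters queued at the moment of the pulse are released.]

    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    static int pulse_event(int event_fd, __u32 *prev)
    {
            /* Wakes eligible waiters as if the event were set, then leaves
             * it unsignaled; *prev receives the state before the pulse. */
            return ioctl(event_fd, NTSYNC_IOC_EVENT_PULSE, prev);
    }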
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 8 ++++++-- - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 7 insertions(+), 2 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -534,7 +534,7 @@ static int ntsync_mutex_kill(struct ntsy - return ret; - } - --static int ntsync_event_set(struct ntsync_obj *event, void __user *argp) -+static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) - { - struct ntsync_device *dev = event->dev; - __u32 prev_state; -@@ -550,6 +550,8 @@ static int ntsync_event_set(struct ntsyn - if (all) - try_wake_all_obj(dev, event); - try_wake_any_event(event); -+ if (pulse) -+ event->u.event.signaled = false; - - ntsync_unlock_obj(dev, event, all); - -@@ -605,9 +607,11 @@ static long ntsync_obj_ioctl(struct file - case NTSYNC_IOC_MUTEX_KILL: - return ntsync_mutex_kill(obj, argp); - case NTSYNC_IOC_EVENT_SET: -- return ntsync_event_set(obj, argp); -+ return ntsync_event_set(obj, argp, false); - case NTSYNC_IOC_EVENT_RESET: - return ntsync_event_reset(obj, argp); -+ case NTSYNC_IOC_EVENT_PULSE: -+ return ntsync_event_set(obj, argp, true); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -50,5 +50,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) - #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) - #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) -+#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) - - #endif diff --git a/debian/patches/misc-ntsync7/0012-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch b/debian/patches/misc-ntsync7/0012-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch deleted file mode 100644 index 178ab19..0000000 --- a/debian/patches/misc-ntsync7/0012-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 248013d9877d47dc5219344268c10b62de1f52f2 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:53 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_SEM_READ. - -This corresponds to the NT syscall NtQuerySemaphore(). - -This returns the current count and maximum count of the semaphore. 
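[Editor's note: a sketch of the query from user space, assuming sem_fd was returned by NTSYNC_IOC_CREATE_SEM; both struct fields are outputs for this ioctl.]

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    static int print_sem_state(int sem_fd)
    {
            struct ntsync_sem_args args;

            if (ioctl(sem_fd, NTSYNC_IOC_SEM_READ, &args) < 0)
                    return -1;
            printf("semaphore count %u (max %u)\n", args.count, args.max);
            return 0;
    }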
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 25 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -583,6 +583,28 @@ static int ntsync_event_reset(struct nts - return 0; - } - -+static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) -+{ -+ struct ntsync_sem_args __user *user_args = argp; -+ struct ntsync_device *dev = sem->dev; -+ struct ntsync_sem_args args; -+ bool all; -+ -+ if (sem->type != NTSYNC_TYPE_SEM) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, sem); -+ -+ args.count = sem->u.sem.count; -+ args.max = sem->u.sem.max; -+ -+ ntsync_unlock_obj(dev, sem, all); -+ -+ if (copy_to_user(user_args, &args, sizeof(args))) -+ return -EFAULT; -+ return 0; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -602,6 +624,8 @@ static long ntsync_obj_ioctl(struct file - switch (cmd) { - case NTSYNC_IOC_SEM_RELEASE: - return ntsync_sem_release(obj, argp); -+ case NTSYNC_IOC_SEM_READ: -+ return ntsync_sem_read(obj, argp); - case NTSYNC_IOC_MUTEX_UNLOCK: - return ntsync_mutex_unlock(obj, argp); - case NTSYNC_IOC_MUTEX_KILL: ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -51,5 +51,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) - #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) - #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) -+#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) - - #endif diff --git a/debian/patches/misc-ntsync7/0013-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch b/debian/patches/misc-ntsync7/0013-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch deleted file mode 100644 index c40686e..0000000 --- a/debian/patches/misc-ntsync7/0013-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 8fc7a993fd8bc6b1a09b4b965bee7d16bb2156cc Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:54 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_READ. - -This corresponds to the NT syscall NtQueryMutant(). - -This returns the recursion count, owner, and abandoned state of the mutex. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 27 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -605,6 +605,30 @@ static int ntsync_sem_read(struct ntsync - return 0; - } - -+static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) -+{ -+ struct ntsync_mutex_args __user *user_args = argp; -+ struct ntsync_device *dev = mutex->dev; -+ struct ntsync_mutex_args args; -+ bool all; -+ int ret; -+ -+ if (mutex->type != NTSYNC_TYPE_MUTEX) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, mutex); -+ -+ args.count = mutex->u.mutex.count; -+ args.owner = mutex->u.mutex.owner; -+ ret = mutex->u.mutex.ownerdead ? 
-EOWNERDEAD : 0; -+ -+ ntsync_unlock_obj(dev, mutex, all); -+ -+ if (copy_to_user(user_args, &args, sizeof(args))) -+ return -EFAULT; -+ return ret; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -630,6 +654,8 @@ static long ntsync_obj_ioctl(struct file - return ntsync_mutex_unlock(obj, argp); - case NTSYNC_IOC_MUTEX_KILL: - return ntsync_mutex_kill(obj, argp); -+ case NTSYNC_IOC_MUTEX_READ: -+ return ntsync_mutex_read(obj, argp); - case NTSYNC_IOC_EVENT_SET: - return ntsync_event_set(obj, argp, false); - case NTSYNC_IOC_EVENT_RESET: ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -52,5 +52,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) - #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) - #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) -+#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) - - #endif diff --git a/debian/patches/misc-ntsync7/0014-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch b/debian/patches/misc-ntsync7/0014-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch deleted file mode 100644 index 88597a7..0000000 --- a/debian/patches/misc-ntsync7/0014-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch +++ /dev/null @@ -1,64 +0,0 @@ -From aed34cc9c28dba5e3735d7c59e1970a32eefc5f4 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:55 -0600 -Subject: ntsync: Introduce NTSYNC_IOC_EVENT_READ. - -This corresponds to the NT syscall NtQueryEvent(). - -This returns the signaled state of the event and whether it is manual-reset. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++ - include/uapi/linux/ntsync.h | 1 + - 2 files changed, 25 insertions(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -629,6 +629,28 @@ static int ntsync_mutex_read(struct ntsy - return ret; - } - -+static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) -+{ -+ struct ntsync_event_args __user *user_args = argp; -+ struct ntsync_device *dev = event->dev; -+ struct ntsync_event_args args; -+ bool all; -+ -+ if (event->type != NTSYNC_TYPE_EVENT) -+ return -EINVAL; -+ -+ all = ntsync_lock_obj(dev, event); -+ -+ args.manual = event->u.event.manual; -+ args.signaled = event->u.event.signaled; -+ -+ ntsync_unlock_obj(dev, event, all); -+ -+ if (copy_to_user(user_args, &args, sizeof(args))) -+ return -EFAULT; -+ return 0; -+} -+ - static int ntsync_obj_release(struct inode *inode, struct file *file) - { - struct ntsync_obj *obj = file->private_data; -@@ -662,6 +684,8 @@ static long ntsync_obj_ioctl(struct file - return ntsync_event_reset(obj, argp); - case NTSYNC_IOC_EVENT_PULSE: - return ntsync_event_set(obj, argp, true); -+ case NTSYNC_IOC_EVENT_READ: -+ return ntsync_event_read(obj, argp); - default: - return -ENOIOCTLCMD; - } ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -53,5 +53,6 @@ struct ntsync_wait_args { - #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) - #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) - #define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) -+#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) - - #endif diff --git a/debian/patches/misc-ntsync7/0015-ntsync-Introduce-alertable-waits.patch b/debian/patches/misc-ntsync7/0015-ntsync-Introduce-alertable-waits.patch 
deleted file mode 100644 index 2dd76e0..0000000 --- a/debian/patches/misc-ntsync7/0015-ntsync-Introduce-alertable-waits.patch +++ /dev/null @@ -1,187 +0,0 @@ -From 361a7fb848ba9cac87855cb68f9ab000ed1027be Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:34:56 -0600 -Subject: ntsync: Introduce alertable waits. - -NT waits can optionally be made "alertable". This is a special channel for -thread wakeup that is mildly similar to SIGIO. A thread has an internal single -bit of "alerted" state, and if a thread is alerted while an alertable wait, the -wait will return a special value, consume the "alerted" state, and will not -consume any of its objects. - -Alerts are implemented using events; the user-space NT emulator is expected to -create an internal ntsync event for each thread and pass that event to wait -functions. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/ntsync.c | 70 ++++++++++++++++++++++++++++++++----- - include/uapi/linux/ntsync.h | 3 +- - 2 files changed, 63 insertions(+), 10 deletions(-) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -869,22 +869,29 @@ static int setup_wait(struct ntsync_devi - const struct ntsync_wait_args *args, bool all, - struct ntsync_q **ret_q) - { -+ int fds[NTSYNC_MAX_WAIT_COUNT + 1]; - const __u32 count = args->count; -- int fds[NTSYNC_MAX_WAIT_COUNT]; - struct ntsync_q *q; -+ __u32 total_count; - __u32 i, j; - -- if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME)) -+ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) - return -EINVAL; - - if (args->count > NTSYNC_MAX_WAIT_COUNT) - return -EINVAL; - -+ total_count = count; -+ if (args->alert) -+ total_count++; -+ - if (copy_from_user(fds, u64_to_user_ptr(args->objs), - array_size(count, sizeof(*fds)))) - return -EFAULT; -+ if (args->alert) -+ fds[count] = args->alert; - -- q = kmalloc(struct_size(q, entries, count), GFP_KERNEL); -+ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); - if (!q) - return -ENOMEM; - q->task = current; -@@ -894,7 +901,7 @@ static int setup_wait(struct ntsync_devi - q->ownerdead = false; - q->count = count; - -- for (i = 0; i < count; i++) { -+ for (i = 0; i < total_count; i++) { - struct ntsync_q_entry *entry = &q->entries[i]; - struct ntsync_obj *obj = get_obj(dev, fds[i]); - -@@ -944,10 +951,10 @@ static void try_wake_any_obj(struct ntsy - static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) - { - struct ntsync_wait_args args; -+ __u32 i, total_count; - struct ntsync_q *q; - int signaled; - bool all; -- __u32 i; - int ret; - - if (copy_from_user(&args, argp, sizeof(args))) -@@ -957,9 +964,13 @@ static int ntsync_wait_any(struct ntsync - if (ret < 0) - return ret; - -+ total_count = args.count; -+ if (args.alert) -+ total_count++; -+ - /* queue ourselves */ - -- for (i = 0; i < args.count; i++) { -+ for (i = 0; i < total_count; i++) { - struct ntsync_q_entry *entry = &q->entries[i]; - struct ntsync_obj *obj = entry->obj; - -@@ -968,9 +979,15 @@ static int ntsync_wait_any(struct ntsync - ntsync_unlock_obj(dev, obj, all); - } - -- /* check if we are already signaled */ -+ /* -+ * Check if we are already signaled. -+ * -+ * Note that the API requires that normal objects are checked before -+ * the alert event. Hence we queue the alert event last, and check -+ * objects in order. 
-+ */ - -- for (i = 0; i < args.count; i++) { -+ for (i = 0; i < total_count; i++) { - struct ntsync_obj *obj = q->entries[i].obj; - - if (atomic_read(&q->signaled) != -1) -@@ -987,7 +1004,7 @@ static int ntsync_wait_any(struct ntsync - - /* and finally, unqueue */ - -- for (i = 0; i < args.count; i++) { -+ for (i = 0; i < total_count; i++) { - struct ntsync_q_entry *entry = &q->entries[i]; - struct ntsync_obj *obj = entry->obj; - -@@ -1047,6 +1064,14 @@ static int ntsync_wait_all(struct ntsync - */ - list_add_tail(&entry->node, &obj->all_waiters); - } -+ if (args.alert) { -+ struct ntsync_q_entry *entry = &q->entries[args.count]; -+ struct ntsync_obj *obj = entry->obj; -+ -+ dev_lock_obj(dev, obj); -+ list_add_tail(&entry->node, &obj->any_waiters); -+ dev_unlock_obj(dev, obj); -+ } - - /* check if we are already signaled */ - -@@ -1054,6 +1079,21 @@ static int ntsync_wait_all(struct ntsync - - mutex_unlock(&dev->wait_all_lock); - -+ /* -+ * Check if the alert event is signaled, making sure to do so only -+ * after checking if the other objects are signaled. -+ */ -+ -+ if (args.alert) { -+ struct ntsync_obj *obj = q->entries[args.count].obj; -+ -+ if (atomic_read(&q->signaled) == -1) { -+ bool all = ntsync_lock_obj(dev, obj); -+ try_wake_any_obj(obj); -+ ntsync_unlock_obj(dev, obj, all); -+ } -+ } -+ - /* sleep */ - - ret = ntsync_schedule(q, &args); -@@ -1079,6 +1119,18 @@ static int ntsync_wait_all(struct ntsync - - mutex_unlock(&dev->wait_all_lock); - -+ if (args.alert) { -+ struct ntsync_q_entry *entry = &q->entries[args.count]; -+ struct ntsync_obj *obj = entry->obj; -+ bool all; -+ -+ all = ntsync_lock_obj(dev, obj); -+ list_del(&entry->node); -+ ntsync_unlock_obj(dev, obj, all); -+ -+ put_obj(obj); -+ } -+ - signaled = atomic_read(&q->signaled); - if (signaled != -1) { - struct ntsync_wait_args __user *user_args = argp; ---- a/include/uapi/linux/ntsync.h -+++ b/include/uapi/linux/ntsync.h -@@ -34,7 +34,8 @@ struct ntsync_wait_args { - __u32 index; - __u32 flags; - __u32 owner; -- __u32 pad[2]; -+ __u32 alert; -+ __u32 pad; - }; - - #define NTSYNC_MAX_WAIT_COUNT 64 diff --git a/debian/patches/misc-ntsync7/0016-maintainers-Add-an-entry-for-ntsync.patch b/debian/patches/misc-ntsync7/0016-maintainers-Add-an-entry-for-ntsync.patch deleted file mode 100644 index ed5de57..0000000 --- a/debian/patches/misc-ntsync7/0016-maintainers-Add-an-entry-for-ntsync.patch +++ /dev/null @@ -1,30 +0,0 @@ -From b240b27e5348d38acbc4a12f1dc762dd1845f391 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:35:09 -0600 -Subject: maintainers: Add an entry for ntsync. - -Add myself as maintainer, supported by CodeWeavers. 
- -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - MAINTAINERS | 9 +++++++++ - 1 file changed, 9 insertions(+) - ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -16486,6 +16486,15 @@ T: git https://github.com/Paragon-Softwa - F: Documentation/filesystems/ntfs3.rst - F: fs/ntfs3/ - -+NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER -+M: Elizabeth Figura <zfigura@codeweavers.com> -+L: wine-devel@winehq.org -+S: Supported -+F: Documentation/userspace-api/ntsync.rst -+F: drivers/misc/ntsync.c -+F: include/uapi/linux/ntsync.h -+F: tools/testing/selftests/drivers/ntsync/ -+ - NUBUS SUBSYSTEM - M: Finn Thain <fthain@linux-m68k.org> - L: linux-m68k@lists.linux-m68k.org diff --git a/debian/patches/misc-ntsync7/0017-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch b/debian/patches/misc-ntsync7/0017-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch deleted file mode 100644 index c8582c0..0000000 --- a/debian/patches/misc-ntsync7/0017-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch +++ /dev/null @@ -1,413 +0,0 @@ -From 733e310bb840117593a0eb4726fa63b34fea9cc3 Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:35:10 -0600 -Subject: docs: ntsync: Add documentation for the ntsync uAPI. - -Add an overall explanation of the driver architecture, and complete and precise -specification for its intended behaviour. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - Documentation/userspace-api/index.rst | 1 + - Documentation/userspace-api/ntsync.rst | 385 +++++++++++++++++++++++++ - 2 files changed, 386 insertions(+) - create mode 100644 Documentation/userspace-api/ntsync.rst - ---- a/Documentation/userspace-api/index.rst -+++ b/Documentation/userspace-api/index.rst -@@ -63,6 +63,7 @@ Everything else - vduse - futex2 - perf_ring_buffer -+ ntsync - - .. only:: subproject and html - ---- /dev/null -+++ b/Documentation/userspace-api/ntsync.rst -@@ -0,0 +1,385 @@ -+=================================== -+NT synchronization primitive driver -+=================================== -+ -+This page documents the user-space API for the ntsync driver. -+ -+ntsync is a support driver for emulation of NT synchronization -+primitives by user-space NT emulators. It exists because implementation -+in user-space, using existing tools, cannot match Windows performance -+while offering accurate semantics. It is implemented entirely in -+software, and does not drive any hardware device. -+ -+This interface is meant as a compatibility tool only, and should not -+be used for general synchronization. Instead use generic, versatile -+interfaces such as futex(2) and poll(2). -+ -+Synchronization primitives -+========================== -+ -+The ntsync driver exposes three types of synchronization primitives: -+semaphores, mutexes, and events. -+ -+A semaphore holds a single volatile 32-bit counter, and a static 32-bit -+integer denoting the maximum value. It is considered signaled (that is, -+can be acquired without contention, or will wake up a waiting thread) -+when the counter is nonzero. The counter is decremented by one when a -+wait is satisfied. Both the initial and maximum count are established -+when the semaphore is created. -+ -+A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit -+identifier denoting its owner. A mutex is considered signaled when its -+owner is zero (indicating that it is not owned). The recursion count is -+incremented when a wait is satisfied, and ownership is set to the given -+identifier. 
-+ -+A mutex also holds an internal flag denoting whether its previous owner -+has died; such a mutex is said to be abandoned. Owner death is not -+tracked automatically based on thread death, but rather must be -+communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is -+inherently considered unowned. -+ -+Except for the "unowned" semantics of zero, the actual value of the -+owner identifier is not interpreted by the ntsync driver at all. The -+intended use is to store a thread identifier; however, the ntsync -+driver does not actually validate that a calling thread provides -+consistent or unique identifiers. -+ -+An event is similar to a semaphore with a maximum count of one. It holds -+a volatile boolean state denoting whether it is signaled or not. There -+are two types of events, auto-reset and manual-reset. An auto-reset -+event is designaled when a wait is satisfied; a manual-reset event is -+not. The event type is specified when the event is created. -+ -+Unless specified otherwise, all operations on an object are atomic and -+totally ordered with respect to other operations on the same object. -+ -+Objects are represented by files. When all file descriptors to an -+object are closed, that object is deleted. -+ -+Char device -+=========== -+ -+The ntsync driver creates a single char device /dev/ntsync. Each file -+description opened on the device represents a unique instance intended -+to back an individual NT virtual machine. Objects created by one ntsync -+instance may only be used with other objects created by the same -+instance. -+ -+ioctl reference -+=============== -+ -+All operations on the device are done through ioctls. There are four -+structures used in ioctl calls:: -+ -+ struct ntsync_sem_args { -+ __u32 count; -+ __u32 max; -+ }; -+ -+ struct ntsync_mutex_args { -+ __u32 owner; -+ __u32 count; -+ }; -+ -+ struct ntsync_event_args { -+ __u32 signaled; -+ __u32 manual; -+ }; -+ -+ struct ntsync_wait_args { -+ __u64 timeout; -+ __u64 objs; -+ __u32 count; -+ __u32 owner; -+ __u32 index; -+ __u32 alert; -+ __u32 flags; -+ __u32 pad; -+ }; -+ -+Depending on the ioctl, members of the structure may be used as input, -+output, or not at all. -+ -+The ioctls on the device file are as follows: -+ -+.. c:macro:: NTSYNC_IOC_CREATE_SEM -+ -+ Create a semaphore object. Takes a pointer to struct -+ :c:type:`ntsync_sem_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``count`` -+ - Initial count of the semaphore. -+ * - ``max`` -+ - Maximum count of the semaphore. -+ -+ Fails with ``EINVAL`` if ``count`` is greater than ``max``. -+ On success, returns a file descriptor the created semaphore. -+ -+.. c:macro:: NTSYNC_IOC_CREATE_MUTEX -+ -+ Create a mutex object. Takes a pointer to struct -+ :c:type:`ntsync_mutex_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``count`` -+ - Initial recursion count of the mutex. -+ * - ``owner`` -+ - Initial owner of the mutex. -+ -+ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is -+ zero and ``count`` is nonzero, the function fails with ``EINVAL``. -+ On success, returns a file descriptor the created mutex. -+ -+.. c:macro:: NTSYNC_IOC_CREATE_EVENT -+ -+ Create an event object. Takes a pointer to struct -+ :c:type:`ntsync_event_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``signaled`` -+ - If nonzero, the event is initially signaled, otherwise -+ nonsignaled. -+ * - ``manual`` -+ - If nonzero, the event is a manual-reset event, otherwise -+ auto-reset. 
-+ -+ On success, returns a file descriptor the created event. -+ -+The ioctls on the individual objects are as follows: -+ -+.. c:macro:: NTSYNC_IOC_SEM_POST -+ -+ Post to a semaphore object. Takes a pointer to a 32-bit integer, -+ which on input holds the count to be added to the semaphore, and on -+ output contains its previous count. -+ -+ If adding to the semaphore's current count would raise the latter -+ past the semaphore's maximum count, the ioctl fails with -+ ``EOVERFLOW`` and the semaphore is not affected. If raising the -+ semaphore's count causes it to become signaled, eligible threads -+ waiting on this semaphore will be woken and the semaphore's count -+ decremented appropriately. -+ -+.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK -+ -+ Release a mutex object. Takes a pointer to struct -+ :c:type:`ntsync_mutex_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``owner`` -+ - Specifies the owner trying to release this mutex. -+ * - ``count`` -+ - On output, contains the previous recursion count. -+ -+ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` -+ is not the current owner of the mutex, the ioctl fails with -+ ``EPERM``. -+ -+ The mutex's count will be decremented by one. If decrementing the -+ mutex's count causes it to become zero, the mutex is marked as -+ unowned and signaled, and eligible threads waiting on it will be -+ woken as appropriate. -+ -+.. c:macro:: NTSYNC_IOC_SET_EVENT -+ -+ Signal an event object. Takes a pointer to a 32-bit integer, which on -+ output contains the previous state of the event. -+ -+ Eligible threads will be woken, and auto-reset events will be -+ designaled appropriately. -+ -+.. c:macro:: NTSYNC_IOC_RESET_EVENT -+ -+ Designal an event object. Takes a pointer to a 32-bit integer, which -+ on output contains the previous state of the event. -+ -+.. c:macro:: NTSYNC_IOC_PULSE_EVENT -+ -+ Wake threads waiting on an event object while leaving it in an -+ unsignaled state. Takes a pointer to a 32-bit integer, which on -+ output contains the previous state of the event. -+ -+ A pulse operation can be thought of as a set followed by a reset, -+ performed as a single atomic operation. If two threads are waiting on -+ an auto-reset event which is pulsed, only one will be woken. If two -+ threads are waiting a manual-reset event which is pulsed, both will -+ be woken. However, in both cases, the event will be unsignaled -+ afterwards, and a simultaneous read operation will always report the -+ event as unsignaled. -+ -+.. c:macro:: NTSYNC_IOC_READ_SEM -+ -+ Read the current state of a semaphore object. Takes a pointer to -+ struct :c:type:`ntsync_sem_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``count`` -+ - On output, contains the current count of the semaphore. -+ * - ``max`` -+ - On output, contains the maximum count of the semaphore. -+ -+.. c:macro:: NTSYNC_IOC_READ_MUTEX -+ -+ Read the current state of a mutex object. Takes a pointer to struct -+ :c:type:`ntsync_mutex_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``owner`` -+ - On output, contains the current owner of the mutex, or zero -+ if the mutex is not currently owned. -+ * - ``count`` -+ - On output, contains the current recursion count of the mutex. -+ -+ If the mutex is marked as abandoned, the function fails with -+ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to -+ zero. -+ -+.. c:macro:: NTSYNC_IOC_READ_EVENT -+ -+ Read the current state of an event object. 
Takes a pointer to struct -+ :c:type:`ntsync_event_args`, which is used as follows: -+ -+ .. list-table:: -+ -+ * - ``signaled`` -+ - On output, contains the current state of the event. -+ * - ``manual`` -+ - On output, contains 1 if the event is a manual-reset event, -+ and 0 otherwise. -+ -+.. c:macro:: NTSYNC_IOC_KILL_OWNER -+ -+ Mark a mutex as unowned and abandoned if it is owned by the given -+ owner. Takes an input-only pointer to a 32-bit integer denoting the -+ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the -+ owner does not own the mutex, the function fails with ``EPERM``. -+ -+ Eligible threads waiting on the mutex will be woken as appropriate -+ (and such waits will fail with ``EOWNERDEAD``, as described below). -+ -+.. c:macro:: NTSYNC_IOC_WAIT_ANY -+ -+ Poll on any of a list of objects, atomically acquiring at most one. -+ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is -+ used as follows: -+ -+ .. list-table:: -+ -+ * - ``timeout`` -+ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME`` -+ is set, the timeout is measured against the REALTIME clock; -+ otherwise it is measured against the MONOTONIC clock. If the -+ timeout is equal to or earlier than the current time, the -+ function returns immediately without sleeping. If ``timeout`` -+ is U64_MAX, the function will sleep until an object is -+ signaled, and will not fail with ``ETIMEDOUT``. -+ * - ``objs`` -+ - Pointer to an array of ``count`` file descriptors -+ (specified as an integer so that the structure has the same -+ size regardless of architecture). If any object is -+ invalid, the function fails with ``EINVAL``. -+ * - ``count`` -+ - Number of objects specified in the ``objs`` array. -+ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails -+ with ``EINVAL``. -+ * - ``owner`` -+ - Mutex owner identifier. If any object in ``objs`` is a mutex, -+ the ioctl will attempt to acquire that mutex on behalf of -+ ``owner``. If ``owner`` is zero, the ioctl fails with -+ ``EINVAL``. -+ * - ``index`` -+ - On success, contains the index (into ``objs``) of the object -+ which was signaled. If ``alert`` was signaled instead, -+ this contains ``count``. -+ * - ``alert`` -+ - Optional event object file descriptor. If nonzero, this -+ specifies an "alert" event object which, if signaled, will -+ terminate the wait. If nonzero, the identifier must point to a -+ valid event. -+ * - ``flags`` -+ - Zero or more flags. Currently the only flag is -+ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be -+ measured against the REALTIME clock instead of MONOTONIC. -+ * - ``pad`` -+ - Unused, must be set to zero. -+ -+ This function attempts to acquire one of the given objects. If unable -+ to do so, it sleeps until an object becomes signaled, subsequently -+ acquiring it, or the timeout expires. In the latter case the ioctl -+ fails with ``ETIMEDOUT``. The function only acquires one object, even -+ if multiple objects are signaled. -+ -+ A semaphore is considered to be signaled if its count is nonzero, and -+ is acquired by decrementing its count by one. A mutex is considered -+ to be signaled if it is unowned or if its owner matches the ``owner`` -+ argument, and is acquired by incrementing its recursion count by one -+ and setting its owner to the ``owner`` argument. An auto-reset event -+ is acquired by designaling it; a manual-reset event is not affected -+ by acquisition. -+ -+ Acquisition is atomic and totally ordered with respect to other -+ operations on the same object. 
If two wait operations (with different -+ ``owner`` identifiers) are queued on the same mutex, only one is -+ signaled. If two wait operations are queued on the same semaphore, -+ and a value of one is posted to it, only one is signaled. -+ -+ If an abandoned mutex is acquired, the ioctl fails with -+ ``EOWNERDEAD``. Although this is a failure return, the function may -+ otherwise be considered successful. The mutex is marked as owned by -+ the given owner (with a recursion count of 1) and as no longer -+ abandoned, and ``index`` is still set to the index of the mutex. -+ -+ The ``alert`` argument is an "extra" event which can terminate the -+ wait, independently of all other objects. -+ -+ It is valid to pass the same object more than once, including by -+ passing the same event in the ``objs`` array and in ``alert``. If a -+ wakeup occurs due to that object being signaled, ``index`` is set to -+ the lowest index corresponding to that object. -+ -+ The function may fail with ``EINTR`` if a signal is received. -+ -+.. c:macro:: NTSYNC_IOC_WAIT_ALL -+ -+ Poll on a list of objects, atomically acquiring all of them. Takes a -+ pointer to struct :c:type:`ntsync_wait_args`, which is used -+ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is -+ always filled with zero on success if not woken via alert. -+ -+ This function attempts to simultaneously acquire all of the given -+ objects. If unable to do so, it sleeps until all objects become -+ simultaneously signaled, subsequently acquiring them, or the timeout -+ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no -+ objects are modified. -+ -+ Objects may become signaled and subsequently designaled (through -+ acquisition by other threads) while this thread is sleeping. Only -+ once all objects are simultaneously signaled does the ioctl acquire -+ them and return. The entire acquisition is atomic and totally ordered -+ with respect to other operations on any of the given objects. -+ -+ If an abandoned mutex is acquired, the ioctl fails with -+ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are -+ nevertheless marked as acquired. Note that if multiple mutex objects -+ are specified, there is no way to know which were marked as -+ abandoned. -+ -+ As with "any" waits, the ``alert`` argument is an "extra" event which -+ can terminate the wait. Critically, however, an "all" wait will -+ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is -+ signaled. In the latter case ``index`` will be set to ``count``. As -+ with "any" waits, if both conditions are filled, the former takes -+ priority, and objects in ``objs`` will be acquired. -+ -+ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same -+ object more than once, nor is it valid to pass the same object in -+ ``objs`` and in ``alert``. If this is attempted, the function fails -+ with ``EINVAL``. diff --git a/debian/patches/misc-ntsync7/0018-ntsync-No-longer-depend-on-BROKEN.patch b/debian/patches/misc-ntsync7/0018-ntsync-No-longer-depend-on-BROKEN.patch deleted file mode 100644 index ea9b498..0000000 --- a/debian/patches/misc-ntsync7/0018-ntsync-No-longer-depend-on-BROKEN.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 4871bb89577d78a3d55b44e47c3a4f677dbdc89b Mon Sep 17 00:00:00 2001 -From: Elizabeth Figura <zfigura@codeweavers.com> -Date: Fri, 13 Dec 2024 13:35:11 -0600 -Subject: ntsync: No longer depend on BROKEN. 
- -f5b335dc025cfee90957efa90dc72fada0d5abb4 ("misc: ntsync: mark driver as "broken" -to prevent from building") was committed to avoid the driver being used while -only part of its functionality was released. Since the rest of the functionality -has now been committed, revert this. - -Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com> ---- - drivers/misc/Kconfig | 1 - - 1 file changed, 1 deletion(-) - ---- a/drivers/misc/Kconfig -+++ b/drivers/misc/Kconfig -@@ -517,7 +517,6 @@ config OPEN_DICE - - config NTSYNC - tristate "NT synchronization primitive emulation" -- depends on BROKEN - help - This module provides kernel support for emulation of Windows NT - synchronization primitives. It is not a hardware driver. diff --git a/debian/patches/misc-ntsync7/0019-ntsync-Set-the-permissions-to-be-0666.patch b/debian/patches/misc-ntsync7/0019-ntsync-Set-the-permissions-to-be-0666.patch deleted file mode 100644 index 809b5f3..0000000 --- a/debian/patches/misc-ntsync7/0019-ntsync-Set-the-permissions-to-be-0666.patch +++ /dev/null @@ -1,22 +0,0 @@ -From 2aa0cab3a568e6adccbe708ee2e79185638860ed Mon Sep 17 00:00:00 2001 -From: Mike Lothian <mike@fireburn.co.uk> -Date: Fri, 14 Feb 2025 12:28:00 +0000 -Subject: ntsync: Set the permissions to be 0666 - -This allows ntsync to be usuable by non-root processes out of the box - -Signed-off-by: Mike Lothian <mike@fireburn.co.uk> ---- - drivers/misc/ntsync.c | 1 + - 1 file changed, 1 insertion(+) - ---- a/drivers/misc/ntsync.c -+++ b/drivers/misc/ntsync.c -@@ -1206,6 +1206,7 @@ static struct miscdevice ntsync_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = NTSYNC_NAME, - .fops = &ntsync_fops, -+ .mode = 0666, // Setting file permissions to 0666 - }; - - module_misc_device(ntsync_misc); diff --git a/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch index 9a93cc6..6d08d65 100644 --- a/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch +++ b/debian/patches/misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch @@ -21,12 +21,12 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -62,7 +62,8 @@ bool __ieee80211_recalc_txpower(struct i - if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL) - power = min(power, sdata->deflink.user_power_level); + if (link->user_power_level != IEEE80211_UNSET_POWER_LEVEL) + power = min(power, link->user_power_level); -- if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL) -+ if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL && -+ sdata->vif.bss_conf.txpower_type != NL80211_TX_POWER_FIXED) - power = min(power, sdata->deflink.ap_power_level); +- if (link->ap_power_level != IEEE80211_UNSET_POWER_LEVEL) ++ if (link->ap_power_level != IEEE80211_UNSET_POWER_LEVEL && ++ link->conf->txpower_type != NL80211_TX_POWER_FIXED) + power = min(power, link->ap_power_level); - if (power != sdata->vif.bss_conf.txpower) { + if (power != link->conf->txpower) { diff --git a/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch index e7bf781..f83b2a7 100644 --- a/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch +++ b/debian/patches/mixed-arch/0001-ZEN-Add-graysky-s-more-ISA-levels-and-uarches.patch @@ -1,4 
+1,4 @@ -From 421120bda34d994c5e0e07a89e2f9c40c53e8e87 Mon Sep 17 00:00:00 2001 +From 90b69178f6a866c7f3330c2006f6b5396146192c Mon Sep 17 00:00:00 2001 From: graysky <therealgraysky AT proton DOT me> Date: Mon, 16 Sep 2024 05:55:58 -0400 Subject: ZEN: Add graysky's more-uarches @@ -123,10 +123,10 @@ REFERENCES 3. https://github.com/graysky2/kernel_gcc_patch/issues/15 4. http://www.linuxforge.net/docs/linux/linux-gcc.php --- - arch/x86/Kconfig.cpu | 359 ++++++++++++++++++++++++++++++-- - arch/x86/Makefile | 87 +++++++- - arch/x86/include/asm/vermagic.h | 70 +++++++ - 3 files changed, 499 insertions(+), 17 deletions(-) + arch/x86/Kconfig.cpu | 367 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 89 +++++++- + arch/x86/include/asm/vermagic.h | 72 +++++++ + 3 files changed, 511 insertions(+), 17 deletions(-) --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -257,7 +257,7 @@ REFERENCES + +config MZEN5 + bool "AMD Zen 5" -+ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000) ++ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 190100) + help + Select this for AMD Family 19h Zen 5 processors. + @@ -285,7 +285,7 @@ REFERENCES help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -278,14 +388,191 @@ config MCORE2 +@@ -278,14 +388,199 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) @@ -393,14 +393,22 @@ REFERENCES + + Enables -march=cannonlake + -+config MICELAKE ++config MICELAKE_CLIENT + bool "Intel Ice Lake" + help + -+ Select this for 10th Gen Core processors in the Ice Lake family. ++ Select this for 10th Gen Core client processors in the Ice Lake family. + + Enables -march=icelake-client + ++config MICELAKE_SERVER ++ bool "Intel Ice Lake Server" ++ help ++ ++ Select this for 10th Gen Core server processors in the Ice Lake family. ++ ++ Enables -march=icelake-server ++ +config MCASCADELAKE + bool "Intel Cascade Lake" + help @@ -483,7 +491,7 @@ REFERENCES config GENERIC_CPU bool "Generic-x86-64" -@@ -294,6 +581,26 @@ config GENERIC_CPU +@@ -294,6 +589,26 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. @@ -510,7 +518,7 @@ REFERENCES endchoice config X86_GENERIC -@@ -308,6 +615,30 @@ config X86_GENERIC +@@ -308,6 +623,30 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. 
@@ -541,32 +549,32 @@ REFERENCES # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT -@@ -318,7 +649,7 @@ config X86_INTERNODE_CACHE_SHIFT +@@ -318,7 +657,7 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU -+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -@@ -336,11 +667,11 @@ config X86_ALIGNMENT_16 +@@ -336,11 +675,11 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || 
MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= --- a/arch/x86/Makefile +++ b/arch/x86/Makefile -@@ -182,15 +182,96 @@ else +@@ -182,15 +182,98 @@ else cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona cflags-$(CONFIG_MCORE2) += -march=core2 @@ -608,7 +616,8 @@ REFERENCES + cflags-$(CONFIG_MSKYLAKE) += -march=skylake + cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 + cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake -+ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MICELAKE_CLIENT) += -march=icelake-client ++ cflags-$(CONFIG_MICELAKE_SERVER) += -march=icelake-server + cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake + cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake + cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake @@ -653,7 +662,8 @@ REFERENCES + rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake + rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512 + rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake -+ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MICELAKE_CLIENT) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MICELAKE_SERVER) += -Ctarget-cpu=icelake-server + rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake + rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake + rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake @@ -668,7 +678,7 @@ REFERENCES KBUILD_CFLAGS += -mno-red-zone --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h -@@ -17,6 +17,54 @@ +@@ -17,6 +17,56 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " @@ -700,8 +710,10 @@ REFERENCES +#define MODULE_PROC_FAMILY "SKYLAKEX " +#elif defined CONFIG_MCANNONLAKE +#define MODULE_PROC_FAMILY "CANNONLAKE " -+#elif defined CONFIG_MICELAKE -+#define MODULE_PROC_FAMILY "ICELAKE " ++#elif defined CONFIG_MICELAKE_CLIENT ++#define MODULE_PROC_FAMILY "ICELAKE_CLIENT " ++#elif defined CONFIG_MICELAKE_SERVER ++#define MODULE_PROC_FAMILY "ICELAKE_SERVER " +#elif defined CONFIG_MCASCADELAKE +#define MODULE_PROC_FAMILY "CASCADELAKE " +#elif defined CONFIG_MCOOPERLAKE @@ -723,7 +735,7 @@ REFERENCES #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 -@@ -35,6 +83,28 @@ +@@ 
-35,6 +85,28 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " diff --git a/debian/patches/mixed-arch/0002-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0002-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch index 8720741..34844ef 100644 --- a/debian/patches/mixed-arch/0002-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch +++ b/debian/patches/mixed-arch/0002-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -1,4 +1,4 @@ -From 1fc1195e784540ad1966b57267fc927a87c5d21d Mon Sep 17 00:00:00 2001 +From f4f448a305e9d705b9a0da102ddfd58bfaac5cc0 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> Date: Sun, 11 Dec 2022 23:51:16 +0100 Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3 @@ -13,7 +13,7 @@ dependency on CONFIG_ARC and adds RUSTFLAGS. --- a/Makefile +++ b/Makefile -@@ -820,6 +820,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointe +@@ -872,6 +872,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointe ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -25,7 +25,7 @@ dependency on CONFIG_ARC and adds RUSTFLAGS. KBUILD_RUSTFLAGS += -Copt-level=s --- a/init/Kconfig +++ b/init/Kconfig -@@ -1451,6 +1451,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1465,6 +1465,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. diff --git a/debian/patches/mixed-arch/0003-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/debian/patches/mixed-arch/0003-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch index 51d445b..4bf6bd0 100644 --- a/debian/patches/mixed-arch/0003-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch +++ b/debian/patches/mixed-arch/0003-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch @@ -1,6 +1,6 @@ --- a/Makefile +++ b/Makefile -@@ -828,6 +828,10 @@ KBUILD_CFLAGS += -Os +@@ -880,6 +880,10 @@ KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s endif diff --git a/debian/patches/mixed-arch/0004-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch b/debian/patches/mixed-arch/0004-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch index f90593e..0257629 100644 --- a/debian/patches/mixed-arch/0004-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch +++ b/debian/patches/mixed-arch/0004-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch @@ -1,8 +1,7 @@ From 3ebc1fdf3e0ee9bff1efe20eb5791eba5c84a810 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Thu, 3 Aug 2023 13:53:49 +0000 -Subject: [PATCH 01/19] XANMOD: x86/build: Prevent generating avx2 and avx512 - floating-point code +Subject: XANMOD: x86/build: Prevent generating avx2 and avx512 floating-point code Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- diff --git a/debian/patches/mixed-arch/0006-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch b/debian/patches/mixed-arch/0006-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch index e15421f..38bebbf 100644 --- a/debian/patches/mixed-arch/0006-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch +++ b/debian/patches/mixed-arch/0006-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch @@ -1,17 +1,16 @@ -From dccbc0ca6c05ae315967a603870d553c231a68a1 Mon Sep 17 00:00:00 2001 +From b1a99a2a9675f80b7c04a239a6b047373ccf3a17 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 16 Sep 2024 00:55:35 +0000 -Subject: [PATCH 02/18] 
XANMOD: kbuild: Add GCC SMS-based modulo scheduling - flags +Subject: XANMOD: kbuild: Add GCC SMS-based modulo scheduling flags Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- - Makefile | 7 ++++ - 1 file changed, 7 insertions(+) + Makefile | 4 ++++ + 1 file changed, 4 insertions(+) --- a/Makefile +++ b/Makefile -@@ -832,6 +832,13 @@ ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE +@@ -884,6 +884,13 @@ ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += $(call cc-option,-fivopts) endif diff --git a/debian/patches/mixed-arch/0007-PF-kbuild-6.12-adopt-proposed-upstream-change-for-gener.patch b/debian/patches/mixed-arch/0007-PF-kbuild-6.12-adopt-proposed-upstream-change-for-gener.patch deleted file mode 100644 index 35a2a6b..0000000 --- a/debian/patches/mixed-arch/0007-PF-kbuild-6.12-adopt-proposed-upstream-change-for-gener.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 578cb97f41beb5d2dde81b8a4c1db6e01e8bcc6a Mon Sep 17 00:00:00 2001 -From: Oleksandr Natalenko <oleksandr@natalenko.name> -Date: Wed, 11 Dec 2024 08:50:50 +0100 -Subject: kbuild-6.12: adopt proposed upstream change for generic CPUs - -Link: https://lore.kernel.org/lkml/20241210144945.2325330-4-arnd@kernel.org/ -Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ---- - arch/x86/Makefile | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - ---- a/arch/x86/Makefile -+++ b/arch/x86/Makefile -@@ -184,8 +184,8 @@ else - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=bonnell - ifeq ($(CONFIG_X86_64_VERSION),1) -- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic -- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic -+ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64 -mtune=generic -+ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64 -Ztune-cpu=generic - else - cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION) - rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) diff --git a/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-Modify-the-min_perf-calculation-i.patch b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-Modify-the-min_perf-calculation-i.patch new file mode 100644 index 0000000..8a2ca33 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-Modify-the-min_perf-calculation-i.patch @@ -0,0 +1,59 @@ +From b6c0305214154bc26d20b130266fc1ba8341b58c Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Date: Wed, 5 Feb 2025 11:25:14 +0000 +Subject: cpufreq/amd-pstate: Modify the min_perf calculation in adjust_perf + callback + +Instead of setting a fixed floor at lowest_nonlinear_perf, use the +min_limit_perf value, so that it gives the user the freedom to lower the +floor further. + +There are two minimum frequency/perf limits that we need to consider in +the adjust_perf callback. One provided by schedutil i.e. the sg_cpu->bw_min +value passed in _min_perf arg, another is the effective value of +min_freq_qos request that is updated in cpudata->min_limit_perf. Modify the +code to use the bigger of these two values. 
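+
+Roughly, the resulting floor selection behaves like the following sketch
+(an illustration of the intent, not the literal diff below):
+
+	/* honour the larger of the schedutil hint and the QoS-derived floor */
+	min_perf = max(min_perf, min_limit_perf);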
+ +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -672,7 +672,7 @@ static void amd_pstate_adjust_perf(unsig + unsigned long capacity) + { + unsigned long max_perf, min_perf, des_perf, +- cap_perf, lowest_nonlinear_perf; ++ cap_perf, min_limit_perf; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata; + +@@ -684,20 +684,20 @@ static void amd_pstate_adjust_perf(unsig + if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) + amd_pstate_update_min_max_limit(policy); + +- + cap_perf = READ_ONCE(cpudata->highest_perf); +- lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); ++ min_limit_perf = READ_ONCE(cpudata->min_limit_perf); + + des_perf = cap_perf; + if (target_perf < capacity) + des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); + +- min_perf = READ_ONCE(cpudata->lowest_perf); + if (_min_perf < capacity) + min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); ++ else ++ min_perf = cap_perf; + +- if (min_perf < lowest_nonlinear_perf) +- min_perf = lowest_nonlinear_perf; ++ if (min_perf < min_limit_perf) ++ min_perf = min_limit_perf; + + max_perf = cpudata->max_limit_perf; + if (max_perf < min_perf) diff --git a/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Remove-the-redundant-des_perf-cla.patch b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Remove-the-redundant-des_perf-cla.patch new file mode 100644 index 0000000..a448bdb --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Remove-the-redundant-des_perf-cla.patch @@ -0,0 +1,27 @@ +From 6e51c53b5e940312c71ce5ea68cf94a000beab01 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Date: Wed, 5 Feb 2025 11:25:15 +0000 +Subject: cpufreq/amd-pstate: Remove the redundant des_perf clamping in + adjust_perf + +des_perf is later on clamped between min_perf and max_perf in +amd_pstate_update. So, remove the redundant clamping from +amd_pstate_adjust_perf. 
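+
+For reference, the clamp that remains sits in the common update path; a
+minimal sketch of that call site (types as of this patch):
+
+	/* amd_pstate_update(): des_perf is bounded once, here */
+	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);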
+ +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -703,8 +703,6 @@ static void amd_pstate_adjust_perf(unsig + if (max_perf < min_perf) + max_perf = min_perf; + +- des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); +- + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + policy->governor->flags); + cpufreq_cpu_put(policy); diff --git a/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-Pass-min-max_limit_perf-as-min-ma.patch b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-Pass-min-max_limit_perf-as-min-ma.patch new file mode 100644 index 0000000..fa4d3cb --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-Pass-min-max_limit_perf-as-min-ma.patch @@ -0,0 +1,51 @@ +From ad3fffe8ff1f18ad437d8b0d0bb602ba3c24adf7 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Date: Wed, 5 Feb 2025 11:25:16 +0000 +Subject: cpufreq/amd-pstate: Pass min/max_limit_perf as min/max_perf to + amd_pstate_update + +Currently, amd_pstate_update_freq passes the hardware perf limits as +min/max_perf to amd_pstate_update, which eventually gets programmed into +the min/max_perf fields of the CPPC_REQ register. + +Instead pass the effective perf limits i.e. min/max_limit_perf values to +amd_pstate_update as min/max_perf. + +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -615,7 +615,7 @@ static int amd_pstate_update_freq(struct + { + struct cpufreq_freqs freqs; + struct amd_cpudata *cpudata = policy->driver_data; +- unsigned long max_perf, min_perf, des_perf, cap_perf; ++ unsigned long des_perf, cap_perf; + + if (!cpudata->max_freq) + return -ENODEV; +@@ -624,8 +624,6 @@ static int amd_pstate_update_freq(struct + amd_pstate_update_min_max_limit(policy); + + cap_perf = READ_ONCE(cpudata->highest_perf); +- min_perf = READ_ONCE(cpudata->lowest_perf); +- max_perf = cap_perf; + + freqs.old = policy->cur; + freqs.new = target_freq; +@@ -642,8 +640,9 @@ static int amd_pstate_update_freq(struct + if (!fast_switch) + cpufreq_freq_transition_begin(policy, &freqs); + +- amd_pstate_update(cpudata, min_perf, des_perf, +- max_perf, fast_switch, policy->governor->flags); ++ amd_pstate_update(cpudata, cpudata->min_limit_perf, des_perf, ++ cpudata->max_limit_perf, fast_switch, ++ policy->governor->flags); + + if (!fast_switch) + cpufreq_freq_transition_end(policy, &freqs, false); diff --git a/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Convert-all-perf-values-to-u8.patch b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Convert-all-perf-values-to-u8.patch new file mode 100644 index 0000000..948010b --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Convert-all-perf-values-to-u8.patch @@ -0,0 +1,355 @@ +From 300686c32b77583f45c6763535da85f2242bf820 Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Date: Wed, 5 Feb 2025 11:25:17 +0000 +Subject: cpufreq/amd-pstate: Convert all perf values to u8 + +All perf values are always within 0-255 range, hence convert their +datatype 
to u8 everywhere. + +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate-trace.h | 46 +++++++++++------------ + drivers/cpufreq/amd-pstate.c | 60 +++++++++++++++--------------- + drivers/cpufreq/amd-pstate.h | 18 ++++----- + 3 files changed, 62 insertions(+), 62 deletions(-) + +--- a/drivers/cpufreq/amd-pstate-trace.h ++++ b/drivers/cpufreq/amd-pstate-trace.h +@@ -24,9 +24,9 @@ + + TRACE_EVENT(amd_pstate_perf, + +- TP_PROTO(unsigned long min_perf, +- unsigned long target_perf, +- unsigned long capacity, ++ TP_PROTO(u8 min_perf, ++ u8 target_perf, ++ u8 capacity, + u64 freq, + u64 mperf, + u64 aperf, +@@ -47,9 +47,9 @@ TRACE_EVENT(amd_pstate_perf, + ), + + TP_STRUCT__entry( +- __field(unsigned long, min_perf) +- __field(unsigned long, target_perf) +- __field(unsigned long, capacity) ++ __field(u8, min_perf) ++ __field(u8, target_perf) ++ __field(u8, capacity) + __field(unsigned long long, freq) + __field(unsigned long long, mperf) + __field(unsigned long long, aperf) +@@ -70,10 +70,10 @@ TRACE_EVENT(amd_pstate_perf, + __entry->fast_switch = fast_switch; + ), + +- TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", +- (unsigned long)__entry->min_perf, +- (unsigned long)__entry->target_perf, +- (unsigned long)__entry->capacity, ++ TP_printk("amd_min_perf=%hhu amd_des_perf=%hhu amd_max_perf=%hhu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", ++ (u8)__entry->min_perf, ++ (u8)__entry->target_perf, ++ (u8)__entry->capacity, + (unsigned long long)__entry->freq, + (unsigned long long)__entry->mperf, + (unsigned long long)__entry->aperf, +@@ -86,10 +86,10 @@ TRACE_EVENT(amd_pstate_perf, + TRACE_EVENT(amd_pstate_epp_perf, + + TP_PROTO(unsigned int cpu_id, +- unsigned int highest_perf, +- unsigned int epp, +- unsigned int min_perf, +- unsigned int max_perf, ++ u8 highest_perf, ++ u8 epp, ++ u8 min_perf, ++ u8 max_perf, + bool boost + ), + +@@ -102,10 +102,10 @@ TRACE_EVENT(amd_pstate_epp_perf, + + TP_STRUCT__entry( + __field(unsigned int, cpu_id) +- __field(unsigned int, highest_perf) +- __field(unsigned int, epp) +- __field(unsigned int, min_perf) +- __field(unsigned int, max_perf) ++ __field(u8, highest_perf) ++ __field(u8, epp) ++ __field(u8, min_perf) ++ __field(u8, max_perf) + __field(bool, boost) + ), + +@@ -118,12 +118,12 @@ TRACE_EVENT(amd_pstate_epp_perf, + __entry->boost = boost; + ), + +- TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u", ++ TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u", + (unsigned int)__entry->cpu_id, +- (unsigned int)__entry->min_perf, +- (unsigned int)__entry->max_perf, +- (unsigned int)__entry->highest_perf, +- (unsigned int)__entry->epp, ++ (u8)__entry->min_perf, ++ (u8)__entry->max_perf, ++ (u8)__entry->highest_perf, ++ (u8)__entry->epp, + (bool)__entry->boost + ) + ); +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -186,7 +186,7 @@ static inline int get_mode_idx_from_str( + static DEFINE_MUTEX(amd_pstate_limits_lock); + static DEFINE_MUTEX(amd_pstate_driver_lock); + +-static s16 msr_get_epp(struct amd_cpudata *cpudata) ++static u8 msr_get_epp(struct amd_cpudata *cpudata) + { + u64 value; + int ret; +@@ -207,7 +207,7 @@ static inline s16 amd_pstate_get_epp(str + return static_call(amd_pstate_get_epp)(cpudata); + } + +-static s16 shmem_get_epp(struct amd_cpudata *cpudata) ++static u8 shmem_get_epp(struct amd_cpudata *cpudata) + { 
+ u64 epp; + int ret; +@@ -218,11 +218,11 @@ static s16 shmem_get_epp(struct amd_cpud + return ret; + } + +- return (s16)(epp & 0xff); ++ return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp); + } + +-static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) ++static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, ++ u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) + { + u64 value, prev; + +@@ -257,15 +257,15 @@ static int msr_update_perf(struct amd_cp + DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + + static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, +- u32 min_perf, u32 des_perf, +- u32 max_perf, u32 epp, ++ u8 min_perf, u8 des_perf, ++ u8 max_perf, u8 epp, + bool fast_switch) + { + return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, + max_perf, epp, fast_switch); + } + +-static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) ++static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) + { + u64 value, prev; + int ret; +@@ -292,12 +292,12 @@ static int msr_set_epp(struct amd_cpudat + + DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); + +-static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp) + { + return static_call(amd_pstate_set_epp)(cpudata, epp); + } + +-static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) ++static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp) + { + int ret; + struct cppc_perf_ctrls perf_ctrls; +@@ -320,7 +320,7 @@ static int amd_pstate_set_energy_pref_in + int pref_index) + { + struct amd_cpudata *cpudata = policy->driver_data; +- int epp; ++ u8 epp; + + if (!pref_index) + epp = cpudata->epp_default; +@@ -479,8 +479,8 @@ static inline int amd_pstate_init_perf(s + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) ++static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, ++ u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) + { + struct cppc_perf_ctrls perf_ctrls; + +@@ -531,14 +531,14 @@ static inline bool amd_pstate_sample(str + return true; + } + +-static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) ++static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, ++ u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) + { + unsigned long max_freq; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); +- u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); ++ u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); + +- des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); ++ des_perf = clamp_t(u8, des_perf, min_perf, max_perf); + + max_freq = READ_ONCE(cpudata->max_limit_freq); + policy->cur = div_u64(des_perf * max_freq, max_perf); +@@ -550,7 +550,7 @@ static void amd_pstate_update(struct amd + + /* limit the max perf when core performance boost feature is disabled */ + if (!cpudata->boost_supported) +- max_perf = min_t(unsigned long, nominal_perf, max_perf); ++ max_perf = min_t(u8, nominal_perf, max_perf); + + if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { + trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, +@@ -591,7 +591,8 @@ static int amd_pstate_verify(struct cpuf + + static int 
amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, max_perf, max_freq; ++ u8 max_limit_perf, min_limit_perf, max_perf; ++ u32 max_freq; + struct amd_cpudata *cpudata = policy->driver_data; + + max_perf = READ_ONCE(cpudata->highest_perf); +@@ -615,7 +616,7 @@ static int amd_pstate_update_freq(struct + { + struct cpufreq_freqs freqs; + struct amd_cpudata *cpudata = policy->driver_data; +- unsigned long des_perf, cap_perf; ++ u8 des_perf, cap_perf; + + if (!cpudata->max_freq) + return -ENODEV; +@@ -670,8 +671,7 @@ static void amd_pstate_adjust_perf(unsig + unsigned long target_perf, + unsigned long capacity) + { +- unsigned long max_perf, min_perf, des_perf, +- cap_perf, min_limit_perf; ++ u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata; + +@@ -905,8 +905,8 @@ static int amd_pstate_init_freq(struct a + { + int ret; + u32 min_freq, max_freq; +- u32 highest_perf, nominal_perf, nominal_freq; +- u32 lowest_nonlinear_perf, lowest_nonlinear_freq; ++ u8 highest_perf, nominal_perf, lowest_nonlinear_perf; ++ u32 nominal_freq, lowest_nonlinear_freq; + struct cppc_perf_caps cppc_perf; + + ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); +@@ -1113,7 +1113,7 @@ static ssize_t show_amd_pstate_lowest_no + static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, + char *buf) + { +- u32 perf; ++ u8 perf; + struct amd_cpudata *cpudata = policy->driver_data; + + perf = READ_ONCE(cpudata->highest_perf); +@@ -1124,7 +1124,7 @@ static ssize_t show_amd_pstate_highest_p + static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, + char *buf) + { +- u32 perf; ++ u8 perf; + struct amd_cpudata *cpudata = policy->driver_data; + + perf = READ_ONCE(cpudata->prefcore_ranking); +@@ -1187,7 +1187,7 @@ static ssize_t show_energy_performance_p + struct cpufreq_policy *policy, char *buf) + { + struct amd_cpudata *cpudata = policy->driver_data; +- int preference; ++ u8 preference; + + switch (cpudata->epp_cached) { + case AMD_CPPC_EPP_PERFORMANCE: +@@ -1549,7 +1549,7 @@ static void amd_pstate_epp_cpu_exit(stru + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 epp; ++ u8 epp; + + amd_pstate_update_min_max_limit(policy); + +@@ -1598,7 +1598,7 @@ static int amd_pstate_epp_set_policy(str + static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u64 max_perf; ++ u8 max_perf; + int ret; + + ret = amd_pstate_cppc_enable(true); +@@ -1635,7 +1635,7 @@ static int amd_pstate_epp_cpu_online(str + static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- int min_perf; ++ u8 min_perf; + + if (cpudata->suspended) + return 0; +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -70,13 +70,13 @@ struct amd_cpudata { + struct freq_qos_request req[2]; + u64 cppc_req_cached; + +- u32 highest_perf; +- u32 nominal_perf; +- u32 lowest_nonlinear_perf; +- u32 lowest_perf; +- u32 prefcore_ranking; +- u32 min_limit_perf; +- u32 max_limit_perf; ++ u8 highest_perf; ++ u8 nominal_perf; ++ u8 lowest_nonlinear_perf; ++ u8 lowest_perf; ++ u8 prefcore_ranking; ++ u8 min_limit_perf; ++ u8 max_limit_perf; + u32 min_limit_freq; + u32 max_limit_freq; + +@@ -93,11 +93,11 @@ struct amd_cpudata { + bool hw_prefcore; + + /* EPP 
feature related attributes*/ +- s16 epp_cached; ++ u8 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; +- s16 epp_default; ++ u8 epp_default; + }; + + /* diff --git a/debian/patches/patchset-pf/amd-pstate/0005-cpufreq-amd-pstate-Modularize-perf-freq-conversion.patch b/debian/patches/patchset-pf/amd-pstate/0005-cpufreq-amd-pstate-Modularize-perf-freq-conversion.patch new file mode 100644 index 0000000..38fe48a --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0005-cpufreq-amd-pstate-Modularize-perf-freq-conversion.patch @@ -0,0 +1,131 @@ +From 8b87350a2e336e54b4d2638ac042bb2f7416312a Mon Sep 17 00:00:00 2001 +From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Date: Wed, 5 Feb 2025 11:25:18 +0000 +Subject: cpufreq/amd-pstate: Modularize perf<->freq conversion + +Delegate the perf<->frequency conversion to helper functions to reduce +code duplication, and improve readability. + +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 57 +++++++++++++++++++----------------- + 1 file changed, 30 insertions(+), 27 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -142,6 +142,20 @@ static struct quirk_entry quirk_amd_7k62 + .lowest_freq = 550, + }; + ++static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val) ++{ ++ u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, ++ cpudata->nominal_freq); ++ ++ return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf); ++} ++ ++static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val) ++{ ++ return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val, ++ cpudata->nominal_perf); ++} ++ + static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) + { + /** +@@ -534,14 +548,12 @@ static inline bool amd_pstate_sample(str + static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) + { +- unsigned long max_freq; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); + u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); + + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); + +- max_freq = READ_ONCE(cpudata->max_limit_freq); +- policy->cur = div_u64(des_perf * max_freq, max_perf); ++ policy->cur = perf_to_freq(cpudata, des_perf); + + if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { + min_perf = des_perf; +@@ -591,14 +603,11 @@ static int amd_pstate_verify(struct cpuf + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u8 max_limit_perf, min_limit_perf, max_perf; +- u32 max_freq; ++ u8 max_limit_perf, min_limit_perf; + struct amd_cpudata *cpudata = policy->driver_data; + +- max_perf = READ_ONCE(cpudata->highest_perf); +- max_freq = READ_ONCE(cpudata->max_freq); +- max_limit_perf = div_u64(policy->max * max_perf, max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, max_freq); ++ max_limit_perf = freq_to_perf(cpudata, policy->max); ++ min_limit_perf = freq_to_perf(cpudata, policy->min); + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); +@@ -616,21 +625,15 @@ static int amd_pstate_update_freq(struct + { + struct cpufreq_freqs freqs; + struct amd_cpudata *cpudata = policy->driver_data; +- u8 des_perf, cap_perf; +- +- if (!cpudata->max_freq) +- return -ENODEV; ++ u8 des_perf; 
+
+ 	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+ 		amd_pstate_update_min_max_limit(policy);
+
+-	cap_perf = READ_ONCE(cpudata->highest_perf);
+-
+ 	freqs.old = policy->cur;
+ 	freqs.new = target_freq;
+
+-	des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf,
+-				     cpudata->max_freq);
++	des_perf = freq_to_perf(cpudata, target_freq);
+
+ 	WARN_ON(fast_switch && !policy->fast_switch_enabled);
+ 	/*
+@@ -905,7 +908,6 @@ static int amd_pstate_init_freq(struct a
+ {
+ 	int ret;
+ 	u32 min_freq, max_freq;
+-	u8 highest_perf, nominal_perf, lowest_nonlinear_perf;
+ 	u32 nominal_freq, lowest_nonlinear_freq;
+ 	struct cppc_perf_caps cppc_perf;
+
+@@ -923,16 +925,17 @@ static int amd_pstate_init_freq(struct a
+ 	else
+ 		nominal_freq = cppc_perf.nominal_freq;
+
+-	highest_perf = READ_ONCE(cpudata->highest_perf);
+-	nominal_perf = READ_ONCE(cpudata->nominal_perf);
+-	max_freq = div_u64((u64)highest_perf * nominal_freq, nominal_perf);
+-
+-	lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
+-	lowest_nonlinear_freq = div_u64((u64)nominal_freq * lowest_nonlinear_perf, nominal_perf);
+-	WRITE_ONCE(cpudata->min_freq, min_freq * 1000);
+-	WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000);
+-	WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000);
+-	WRITE_ONCE(cpudata->max_freq, max_freq * 1000);
++	min_freq *= 1000;
++	nominal_freq *= 1000;
++
++	WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
++	WRITE_ONCE(cpudata->min_freq, min_freq);
++
++	max_freq = perf_to_freq(cpudata, cpudata->highest_perf);
++	lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf);
++
++	WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
++	WRITE_ONCE(cpudata->max_freq, max_freq);
+
+ 	/**
+ 	 * Below values need to be initialized correctly, otherwise driver will fail to load
diff --git a/debian/patches/patchset-pf/amd-pstate/0006-cpufreq-amd-pstate-Remove-the-unnecessary-cpufreq_up.patch b/debian/patches/patchset-pf/amd-pstate/0006-cpufreq-amd-pstate-Remove-the-unnecessary-cpufreq_up.patch
new file mode 100644
index 0000000..61df8f8
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0006-cpufreq-amd-pstate-Remove-the-unnecessary-cpufreq_up.patch
@@ -0,0 +1,37 @@
+From b638a74c3b16e0781bb25478c135726862c9271d Mon Sep 17 00:00:00 2001
+From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Date: Wed, 5 Feb 2025 11:25:19 +0000
+Subject: cpufreq/amd-pstate: Remove the unnecessary cpufreq_update_policy call
+
+The update_limits callback is only called under two conditions.
+
+* When the preferred core rankings change, in which case we just need to
+update the prefcore ranking in the cpudata struct. As none of the perf
+values change, there is no need to call cpufreq_update_policy().
+
+* When the _PPC ACPI object changes, i.e. the highest allowed Pstate
+changes. The _PPC object is only used by table-based cpufreq drivers
+like acpi-cpufreq, and is hence irrelevant to the CPPC-based amd-pstate.
+
+Hence, the cpufreq_update_policy() call becomes unnecessary and can be
+removed.
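+
+The dropped tail of amd_pstate_update_limits() amounted to the sketch
+below:
+
+	if (!highest_perf_changed)
+		cpufreq_update_policy(cpu);	/* no perf values changed, nothing to re-evaluate */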
+
+Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -853,10 +853,6 @@ static void amd_pstate_update_limits(uns
+ 		sched_set_itmt_core_prio((int)cur_high, cpu);
+ 	}
+ 	cpufreq_cpu_put(policy);
+-
+-	if (!highest_perf_changed)
+-		cpufreq_update_policy(cpu);
+-
+ }
+
+ /*
diff --git a/debian/patches/patchset-pf/amd-pstate/0007-cpufreq-amd-pstate-Add-missing-NULL-ptr-check-in-amd.patch b/debian/patches/patchset-pf/amd-pstate/0007-cpufreq-amd-pstate-Add-missing-NULL-ptr-check-in-amd.patch
new file mode 100644
index 0000000..08bfbc6
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0007-cpufreq-amd-pstate-Add-missing-NULL-ptr-check-in-amd.patch
@@ -0,0 +1,26 @@
+From 156278367fd2c0863dc06f9a7df0a654ae336726 Mon Sep 17 00:00:00 2001
+From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Date: Wed, 5 Feb 2025 11:25:21 +0000
+Subject: cpufreq/amd-pstate: Add missing NULL ptr check in amd_pstate_update
+
+Check if policy is NULL before dereferencing it in amd_pstate_update.
+
+Fixes: e8f555daacd3 ("cpufreq/amd-pstate: fix setting policy current frequency value")
+Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -551,6 +551,9 @@ static void amd_pstate_update(struct amd
+ 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu);
+ 	u8 nominal_perf = READ_ONCE(cpudata->nominal_perf);
+
++	if (!policy)
++		return;
++
+ 	des_perf = clamp_t(u8, des_perf, min_perf, max_perf);
+
+ 	policy->cur = perf_to_freq(cpudata, des_perf);
diff --git a/debian/patches/patchset-pf/amd-pstate/0008-cpufreq-amd-pstate-Use-scope-based-cleanup-for-cpufr.patch b/debian/patches/patchset-pf/amd-pstate/0008-cpufreq-amd-pstate-Use-scope-based-cleanup-for-cpufr.patch
new file mode 100644
index 0000000..a3311ce
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0008-cpufreq-amd-pstate-Use-scope-based-cleanup-for-cpufr.patch
@@ -0,0 +1,124 @@
+From e36868a11daa43eff94abd32f19b1783e89298d4 Mon Sep 17 00:00:00 2001
+From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Date: Wed, 5 Feb 2025 11:25:22 +0000
+Subject: cpufreq/amd-pstate: Use scope based cleanup for cpufreq_policy refs
+
+There have been instances in the past where the refcount decrement was
+missed when exiting a function. Use automatic scope-based cleanup to
+avoid such errors.
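+
+The pattern, in miniature (DEFINE_FREE()/__free() are provided by
+<linux/cleanup.h>):
+
+	/* the reference is dropped automatically when 'policy' goes out of scope */
+	struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
+	if (!policy)
+		return;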
+ +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 25 ++++++++----------------- + include/linux/cpufreq.h | 3 +++ + 2 files changed, 11 insertions(+), 17 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -548,7 +548,7 @@ static inline bool amd_pstate_sample(str + static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) + { +- struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); ++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); + u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); + + if (!policy) +@@ -574,8 +574,6 @@ static void amd_pstate_update(struct amd + } + + amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); +- +- cpufreq_cpu_put(policy); + } + + static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) +@@ -587,7 +585,8 @@ static int amd_pstate_verify(struct cpuf + * amd-pstate qos_requests. + */ + if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { +- struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); ++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = ++ cpufreq_cpu_get(policy_data->cpu); + struct amd_cpudata *cpudata; + + if (!policy) +@@ -595,7 +594,6 @@ static int amd_pstate_verify(struct cpuf + + cpudata = policy->driver_data; + policy_data->min = cpudata->lowest_nonlinear_freq; +- cpufreq_cpu_put(policy); + } + + cpufreq_verify_within_cpu_limits(policy_data); +@@ -678,7 +676,7 @@ static void amd_pstate_adjust_perf(unsig + unsigned long capacity) + { + u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf; +- struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); ++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata; + + if (!policy) +@@ -710,7 +708,6 @@ static void amd_pstate_adjust_perf(unsig + + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, + policy->governor->flags); +- cpufreq_cpu_put(policy); + } + + static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) +@@ -824,28 +821,23 @@ static void amd_pstate_init_prefcore(str + + static void amd_pstate_update_limits(unsigned int cpu) + { +- struct cpufreq_policy *policy = NULL; ++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata; + u32 prev_high = 0, cur_high = 0; +- int ret; + bool highest_perf_changed = false; + + if (!amd_pstate_prefcore) + return; + +- policy = cpufreq_cpu_get(cpu); + if (!policy) + return; + +- cpudata = policy->driver_data; +- + guard(mutex)(&amd_pstate_driver_lock); + +- ret = amd_get_highest_perf(cpu, &cur_high); +- if (ret) { +- cpufreq_cpu_put(policy); ++ if (amd_get_highest_perf(cpu, &cur_high)) + return; +- } ++ ++ cpudata = policy->driver_data; + + prev_high = READ_ONCE(cpudata->prefcore_ranking); + highest_perf_changed = (prev_high != cur_high); +@@ -855,7 +847,6 @@ static void amd_pstate_update_limits(uns + if (cur_high < CPPC_MAX_PERF) + sched_set_itmt_core_prio((int)cur_high, cpu); + } +- cpufreq_cpu_put(policy); + } + + /* +--- a/include/linux/cpufreq.h ++++ b/include/linux/cpufreq.h +@@ -210,6 +210,9 @@ static inline struct cpufreq_policy *cpu + static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } + #endif + ++/* Scope based cleanup macro for cpufreq_policy kobject 
reference counting */
++DEFINE_FREE(put_cpufreq_policy, struct cpufreq_policy *, if (_T) cpufreq_cpu_put(_T))
++
+ static inline bool policy_is_inactive(struct cpufreq_policy *policy)
+ {
+ 	return cpumask_empty(policy->cpus);
diff --git a/debian/patches/patchset-pf/amd-pstate/0009-cpufreq-amd-pstate-Remove-the-unncecessary-driver_lo.patch b/debian/patches/patchset-pf/amd-pstate/0009-cpufreq-amd-pstate-Remove-the-unncecessary-driver_lo.patch
new file mode 100644
index 0000000..9480fd8
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0009-cpufreq-amd-pstate-Remove-the-unncecessary-driver_lo.patch
@@ -0,0 +1,26 @@
+From 9b7b7d59c5425246ffda281e761ef3ec3b0e4fbc Mon Sep 17 00:00:00 2001
+From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Date: Wed, 5 Feb 2025 11:25:23 +0000
+Subject: cpufreq/amd-pstate: Remove the unncecessary driver_lock in
+ amd_pstate_update_limits
+
+There is no need to take a driver-wide lock while updating the
+highest_perf value in the percpu cpudata struct. Hence remove it.
+
+Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -832,8 +832,6 @@ static void amd_pstate_update_limits(uns
+ 	if (!policy)
+ 		return;
+
+-	guard(mutex)(&amd_pstate_driver_lock);
+-
+ 	if (amd_get_highest_perf(cpu, &cur_high))
+ 		return;
+
diff --git a/debian/patches/patchset-pf/amd-pstate/0010-cpufreq-amd-pstate-Fix-the-clamping-of-perf-values.patch b/debian/patches/patchset-pf/amd-pstate/0010-cpufreq-amd-pstate-Fix-the-clamping-of-perf-values.patch
new file mode 100644
index 0000000..181ed72
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0010-cpufreq-amd-pstate-Fix-the-clamping-of-perf-values.patch
@@ -0,0 +1,35 @@
+From f09ef5b8aacd5b16ac1ea93103b41a7e88b174ed Mon Sep 17 00:00:00 2001
+From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Date: Sat, 22 Feb 2025 03:32:22 +0000
+Subject: cpufreq/amd-pstate: Fix the clamping of perf values
+
+The clamping in freq_to_perf() is broken right now, as we first typecast
+(read: wrap around) the overflowing value into a u8 and then clamp it down.
+So, use a u32 to store the >255 value in certain edge cases and then clamp
+it down into a u8.
+
+Also, use an "explicit typecast + clamp" instead of just a "clamp_t", as
+the latter typecasts first and then clamps between the limits, which
+defeats our purpose.
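+
+In miniature, with hypothetical values (not driver code; lo/hi stand in
+for the lowest/highest perf bounds):
+
+	u32 perf_val = 300;	/* overflowing input, > U8_MAX */
+	u8 lo = 10, hi = 200;
+
+	clamp_t(u8, perf_val, lo, hi);	/* truncates first: 300 wraps to 44 */
+	(u8)clamp(perf_val, lo, hi);	/* clamps to 200 first, then narrows losslessly */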
+ +Fixes: 305621eb6a8b ("cpufreq/amd-pstate: Modularize perf<->freq conversion") +Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -144,10 +144,10 @@ static struct quirk_entry quirk_amd_7k62 + + static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val) + { +- u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, ++ u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, + cpudata->nominal_freq); + +- return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf); ++ return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf); + } + + static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val) diff --git a/debian/patches/patchset-pf/amd-pstate/0011-cpufreq-amd-pstate-Invalidate-cppc_req_cached-during.patch b/debian/patches/patchset-pf/amd-pstate/0011-cpufreq-amd-pstate-Invalidate-cppc_req_cached-during.patch new file mode 100644 index 0000000..a301e9c --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0011-cpufreq-amd-pstate-Invalidate-cppc_req_cached-during.patch @@ -0,0 +1,42 @@ +From 210d043d7b244588c911e355f2d5339bda9c8209 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:16 -0600 +Subject: cpufreq/amd-pstate: Invalidate cppc_req_cached during suspend + +During resume it's possible the firmware didn't restore the CPPC request +MSR but the kernel thinks the values line up. This leads to incorrect +performance after resume from suspend. + +To fix the issue invalidate the cached value at suspend. During resume use +the saved values programmed as cached limits. + +Reviewed-by: Gautham R. 
Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reported-by: Miroslav Pavleski <miroslav@pavleski.net>
+Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217931
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -1605,7 +1605,7 @@ static int amd_pstate_epp_reenable(struc
+ 					  max_perf, policy->boost_enabled);
+ 	}
+
+-	return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false);
++	return amd_pstate_epp_update_limit(policy);
+ }
+
+ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy)
+@@ -1654,6 +1654,9 @@ static int amd_pstate_epp_suspend(struct
+ 	if (cppc_state != AMD_PSTATE_ACTIVE)
+ 		return 0;
+
++	/* invalidate to ensure it's rewritten during resume */
++	cpudata->cppc_req_cached = 0;
++
+ 	/* set this flag to avoid setting core offline*/
+ 	cpudata->suspended = true;
+
diff --git a/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Show-a-warning-when-a-CPU-fails-t.patch b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Show-a-warning-when-a-CPU-fails-t.patch
new file mode 100644
index 0000000..c6f5e36
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Show-a-warning-when-a-CPU-fails-t.patch
@@ -0,0 +1,35 @@
+From a0233b8c2c01e98ddeb2e80768d4c7172311b200 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:17 -0600
+Subject: cpufreq/amd-pstate: Show a warning when a CPU fails to setup
+
+I came across a system where MSR_AMD_CPPC_CAP1 isn't populated for
+some CPUs. This is unexpected behavior and most likely a BIOS bug.
+In the event it happens, I'd like users to report bugs so we can
+properly root-cause the issue and get it fixed.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -1028,6 +1028,7 @@ static int amd_pstate_cpu_init(struct cp
+ free_cpudata2:
+ 	freq_qos_remove_request(&cpudata->req[0]);
+ free_cpudata1:
++	pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret);
+ 	kfree(cpudata);
+ 	return ret;
+ }
+@@ -1521,6 +1522,7 @@ static int amd_pstate_epp_cpu_init(struc
+ 	return 0;
+
+ free_cpudata1:
++	pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret);
+ 	kfree(cpudata);
+ 	return ret;
+ }
diff --git a/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Drop-min-and-max-cached-frequenci.patch b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Drop-min-and-max-cached-frequenci.patch
new file mode 100644
index 0000000..dbb540e
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Drop-min-and-max-cached-frequenci.patch
@@ -0,0 +1,209 @@
+From ad672c3336331cab028c27e4a73153f517bb1844 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:18 -0600
+Subject: cpufreq/amd-pstate: Drop min and max cached frequencies
+
+Use the perf_to_freq helpers to calculate them on the fly.
+As the members are no longer cached add an extra check into +amd_pstate_epp_update_limit() to avoid unnecessary calls in +amd_pstate_update_min_max_limit(). + +Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com> +Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate-ut.c | 14 +++++------ + drivers/cpufreq/amd-pstate.c | 43 +++++++++------------------------ + drivers/cpufreq/amd-pstate.h | 9 ++----- + 3 files changed, 20 insertions(+), 46 deletions(-) + +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -214,14 +214,14 @@ static void amd_pstate_ut_check_freq(u32 + break; + cpudata = policy->driver_data; + +- if (!((cpudata->max_freq >= cpudata->nominal_freq) && ++ if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) && + (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && +- (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && +- (cpudata->min_freq > 0))) { ++ (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) && ++ (policy->cpuinfo.min_freq > 0))) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", +- __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, +- cpudata->lowest_nonlinear_freq, cpudata->min_freq); ++ __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq, ++ cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq); + goto skip_test; + } + +@@ -233,13 +233,13 @@ static void amd_pstate_ut_check_freq(u32 + } + + if (cpudata->boost_supported) { +- if ((policy->max == cpudata->max_freq) || ++ if ((policy->max == policy->cpuinfo.max_freq) || + (policy->max == cpudata->nominal_freq)) + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + else { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", +- __func__, cpu, policy->max, cpudata->max_freq, ++ __func__, cpu, policy->max, policy->cpuinfo.max_freq, + cpudata->nominal_freq); + goto skip_test; + } +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -717,7 +717,7 @@ static int amd_pstate_cpu_boost_update(s + int ret = 0; + + nominal_freq = READ_ONCE(cpudata->nominal_freq); +- max_freq = READ_ONCE(cpudata->max_freq); ++ max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)); + + if (on) + policy->cpuinfo.max_freq = max_freq; +@@ -917,13 +917,10 @@ static int amd_pstate_init_freq(struct a + nominal_freq *= 1000; + + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); +- WRITE_ONCE(cpudata->min_freq, min_freq); + + max_freq = perf_to_freq(cpudata, cpudata->highest_perf); + lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf); +- + WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); +- WRITE_ONCE(cpudata->max_freq, max_freq); + + /** + * Below values need to be initialized correctly, otherwise driver will fail to load +@@ -948,9 +945,9 @@ static int amd_pstate_init_freq(struct a + + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + { +- int min_freq, max_freq, ret; +- struct device *dev; + struct amd_cpudata *cpudata; ++ struct device *dev; ++ int ret; + + /* + * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, +@@ -981,17 +978,11 @@ static int amd_pstate_cpu_init(struct cp + if (ret) + goto free_cpudata1; + +- min_freq = READ_ONCE(cpudata->min_freq); 
+- max_freq = READ_ONCE(cpudata->max_freq);
+-
+ policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu);
+ policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu);
+
+- policy->min = min_freq;
+- policy->max = max_freq;
+-
+- policy->cpuinfo.min_freq = min_freq;
+- policy->cpuinfo.max_freq = max_freq;
++ policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
++ policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
+
+ policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
+
+@@ -1015,9 +1006,6 @@ static int amd_pstate_cpu_init(struct cp
+ goto free_cpudata2;
+ }
+
+- cpudata->max_limit_freq = max_freq;
+- cpudata->min_limit_freq = min_freq;
+-
+ policy->driver_data = cpudata;
+
+ if (!current_pstate_driver->adjust_perf)
+@@ -1075,14 +1063,10 @@ static int amd_pstate_cpu_suspend(struct
+ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy,
+ char *buf)
+ {
+- int max_freq;
+ struct amd_cpudata *cpudata = policy->driver_data;
+
+- max_freq = READ_ONCE(cpudata->max_freq);
+- if (max_freq < 0)
+- return max_freq;
+
+- return sysfs_emit(buf, "%u\n", max_freq);
++ return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)));
+ }
+
+ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy,
+@@ -1440,10 +1424,10 @@ static bool amd_pstate_acpi_pm_profile_u
+
+ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
+ {
+- int min_freq, max_freq, ret;
+ struct amd_cpudata *cpudata;
+ struct device *dev;
+ u64 value;
++ int ret;
+
+ /*
+ * Resetting PERF_CTL_MSR will put the CPU in P0 frequency,
+@@ -1474,19 +1458,13 @@ static int amd_pstate_epp_cpu_init(struc
+ if (ret)
+ goto free_cpudata1;
+
+- min_freq = READ_ONCE(cpudata->min_freq);
+- max_freq = READ_ONCE(cpudata->max_freq);
+-
+- policy->cpuinfo.min_freq = min_freq;
+- policy->cpuinfo.max_freq = max_freq;
++ policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
++ policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
+ /* It will be updated by governor */
+ policy->cur = policy->cpuinfo.min_freq;
+
+ policy->driver_data = cpudata;
+
+- policy->min = policy->cpuinfo.min_freq;
+- policy->max = policy->cpuinfo.max_freq;
+-
+ policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
+
+ /*
+@@ -1544,7 +1522,8 @@ static int amd_pstate_epp_update_limit(s
+ struct amd_cpudata *cpudata = policy->driver_data;
+ u8 epp;
+
+- amd_pstate_update_min_max_limit(policy);
++ if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
++ amd_pstate_update_min_max_limit(policy);
+
+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
+ epp = 0;
+--- a/drivers/cpufreq/amd-pstate.h
++++ b/drivers/cpufreq/amd-pstate.h
+@@ -46,8 +46,6 @@ struct amd_aperf_mperf {
+ * @max_limit_perf: Cached value of the performance corresponding to policy->max
+ * @min_limit_freq: Cached value of policy->min (in khz)
+ * @max_limit_freq: Cached value of policy->max (in khz)
+- * @max_freq: the frequency (in khz) that mapped to highest_perf
+- * @min_freq: the frequency (in khz) that mapped to lowest_perf
+ * @nominal_freq: the frequency (in khz) that mapped to nominal_perf
+ * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf
+ * @cur: Difference of Aperf/Mperf/tsc count between last and current sample
+@@ -77,11 +75,8 @@ struct amd_cpudata {
+ u8 prefcore_ranking;
+ u8 min_limit_perf;
+ u8 max_limit_perf;
+- u32 min_limit_freq;
+- u32 max_limit_freq;
+-
+- u32 max_freq;
+- u32 min_freq;
++ u32 min_limit_freq;
++ u32 max_limit_freq;
+ u32 nominal_freq;
+ u32 lowest_nonlinear_freq;
+
diff --git a/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Move-perf-values-into-a-union.patch b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Move-perf-values-into-a-union.patch
new file mode 100644
index 0000000..f925f90
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Move-perf-values-into-a-union.patch
@@ -0,0 +1,611 @@
+From b96076ada115f25a4944f6f111b22c44a5d1a3cf Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:19 -0600
+Subject: cpufreq/amd-pstate: Move perf values into a union
+
+By storing perf values in a union, all the writes and reads can
+be done atomically, removing the need for some concurrency protections.
+
+While making this change, also drop the cached frequency values,
+using inline helpers to calculate them on demand from the perf values.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 18 +--
+ drivers/cpufreq/amd-pstate.c | 205 ++++++++++++++++++--------------
+ drivers/cpufreq/amd-pstate.h | 51 +++++---
+ 3 files changed, 158 insertions(+), 116 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -129,6 +129,7 @@ static void amd_pstate_ut_check_perf(u32
+ struct cppc_perf_caps cppc_perf;
+ struct cpufreq_policy *policy = NULL;
+ struct amd_cpudata *cpudata = NULL;
++ union perf_cached cur_perf;
+
+ for_each_possible_cpu(cpu) {
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+ break;
+@@ -162,19 +163,20 @@ static void amd_pstate_ut_check_perf(u32
+ lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
+ }
+
+- if (highest_perf != READ_ONCE(cpudata->highest_perf) && !cpudata->hw_prefcore) {
++ cur_perf = READ_ONCE(cpudata->perf);
++ if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
+ pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
+- __func__, cpu, highest_perf, cpudata->highest_perf);
++ __func__, cpu, highest_perf, cur_perf.highest_perf);
+ goto skip_test;
+ }
+- if ((nominal_perf != READ_ONCE(cpudata->nominal_perf)) ||
+- (lowest_nonlinear_perf != READ_ONCE(cpudata->lowest_nonlinear_perf)) ||
+- (lowest_perf != READ_ONCE(cpudata->lowest_perf))) {
++ if (nominal_perf != cur_perf.nominal_perf ||
++ (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
++ (lowest_perf != cur_perf.lowest_perf)) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n",
+- __func__, cpu, nominal_perf, cpudata->nominal_perf,
+- lowest_nonlinear_perf, cpudata->lowest_nonlinear_perf,
+- lowest_perf, cpudata->lowest_perf);
++ __func__, cpu, nominal_perf, cur_perf.nominal_perf,
++ lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
++ lowest_perf, cur_perf.lowest_perf);
+ goto skip_test;
+ }
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -142,18 +142,17 @@ static struct quirk_entry quirk_amd_7k62
+ .lowest_freq = 550,
+ };
+
+-static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val)
++static inline u8 freq_to_perf(union perf_cached perf, u32 nominal_freq, unsigned int freq_val)
+ {
+- u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf,
+- cpudata->nominal_freq);
++ u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * perf.nominal_perf, nominal_freq);
+
+- return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf);
++ return (u8)clamp(perf_val, perf.lowest_perf, perf.highest_perf);
+ }
+
+-static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val)
++static inline u32 perf_to_freq(union perf_cached perf, u32 nominal_freq, u8 perf_val)
+ {
+- return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val,
+- cpudata->nominal_perf);
++ return DIV_ROUND_UP_ULL((u64)nominal_freq * perf_val,
++ perf.nominal_perf);
+ }
+
+ static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
+@@ -347,7 +346,9 @@ static int amd_pstate_set_energy_pref_in
+ }
+
+ if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
++ union perf_cached perf = READ_ONCE(cpudata->perf);
++
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+ epp,
+ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
+ FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached),
+@@ -425,6 +426,7 @@ static inline int amd_pstate_cppc_enable
+
+ static int msr_init_perf(struct amd_cpudata *cpudata)
+ {
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+ u64 cap1, numerator;
+
+ int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
+@@ -436,19 +438,21 @@ static int msr_init_perf(struct amd_cpud
+ if (ret)
+ return ret;
+
+- WRITE_ONCE(cpudata->highest_perf, numerator);
+- WRITE_ONCE(cpudata->max_limit_perf, numerator);
+- WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
+- WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
+- WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
++ perf.highest_perf = numerator;
++ perf.max_limit_perf = numerator;
++ perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1);
++ perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
++ perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
++ perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
++ WRITE_ONCE(cpudata->perf, perf);
+ WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1));
+- WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1));
+ return 0;
+ }
+
+ static int shmem_init_perf(struct amd_cpudata *cpudata)
+ {
+ struct cppc_perf_caps cppc_perf;
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+ u64 numerator;
+
+ int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
+@@ -459,14 +463,14 @@ static int shmem_init_perf(struct amd_cp
+ if (ret)
+ return ret;
+
+- WRITE_ONCE(cpudata->highest_perf, numerator);
+- WRITE_ONCE(cpudata->max_limit_perf, numerator);
+- WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
+- WRITE_ONCE(cpudata->lowest_nonlinear_perf,
+- cppc_perf.lowest_nonlinear_perf);
+- WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
++ perf.highest_perf = numerator;
++ perf.max_limit_perf = numerator;
++ perf.min_limit_perf = cppc_perf.lowest_perf;
++ perf.nominal_perf = cppc_perf.nominal_perf;
++ perf.lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf;
++ perf.lowest_perf = cppc_perf.lowest_perf;
++ WRITE_ONCE(cpudata->perf, perf);
+ WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf);
+- WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf);
+
+ if (cppc_state == AMD_PSTATE_ACTIVE)
+ return 0;
+@@ -549,14 +553,14 @@ static void amd_pstate_update(struct amd
+ u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags)
+ {
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu);
+- u8 nominal_perf = READ_ONCE(cpudata->nominal_perf);
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+
+ if (!policy)
+ return;
+
+ des_perf = clamp_t(u8, des_perf, min_perf, max_perf);
+
+- policy->cur = perf_to_freq(cpudata, des_perf);
++ policy->cur = perf_to_freq(perf, cpudata->nominal_freq, des_perf);
+
+ if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
+ min_perf = des_perf;
+@@ -565,7 +569,7 @@ static void amd_pstate_update(struct amd
+
+ /* limit the max perf when core performance boost feature is disabled */
+ if (!cpudata->boost_supported)
+- max_perf = min_t(u8, nominal_perf, max_perf);
++ max_perf = min_t(u8, perf.nominal_perf, max_perf);
+
+ if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
+ trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
+@@ -602,39 +606,41 @@ static int amd_pstate_verify(struct cpuf
+ return 0;
+ }
+
+-static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
++static void amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
+ {
+- u8 max_limit_perf, min_limit_perf;
+ struct amd_cpudata *cpudata = policy->driver_data;
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+
+- max_limit_perf = freq_to_perf(cpudata, policy->max);
+- min_limit_perf = freq_to_perf(cpudata, policy->min);
++ perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max);
++ perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min);
+
+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
+- min_limit_perf = min(cpudata->nominal_perf, max_limit_perf);
++ perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf);
+
+- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
+- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
+ WRITE_ONCE(cpudata->max_limit_freq, policy->max);
+ WRITE_ONCE(cpudata->min_limit_freq, policy->min);
+-
+- return 0;
++ WRITE_ONCE(cpudata->perf, perf);
+ }
+
+ static int amd_pstate_update_freq(struct cpufreq_policy *policy,
+ unsigned int target_freq, bool fast_switch)
+ {
+ struct cpufreq_freqs freqs;
+- struct amd_cpudata *cpudata = policy->driver_data;
++ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+ u8 des_perf;
+
++ cpudata = policy->driver_data;
++
+ if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+ amd_pstate_update_min_max_limit(policy);
+
++ perf = READ_ONCE(cpudata->perf);
++
+ freqs.old = policy->cur;
+ freqs.new = target_freq;
+
+- des_perf = freq_to_perf(cpudata, target_freq);
++ des_perf = freq_to_perf(perf, cpudata->nominal_freq, target_freq);
+
+ WARN_ON(fast_switch && !policy->fast_switch_enabled);
+ /*
+@@ -645,8 +651,8 @@ static int amd_pstate_update_freq(struct
+ if (!fast_switch)
+ cpufreq_freq_transition_begin(policy, &freqs);
+
+- amd_pstate_update(cpudata, cpudata->min_limit_perf, des_perf,
+- cpudata->max_limit_perf, fast_switch,
++ amd_pstate_update(cpudata, perf.min_limit_perf, des_perf,
++ perf.max_limit_perf, fast_switch,
+ policy->governor->flags);
+
+ if (!fast_switch)
+@@ -675,9 +681,10 @@ static void amd_pstate_adjust_perf(unsig
+ unsigned long target_perf,
+ unsigned long capacity)
+ {
+- u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf;
++ u8 max_perf, min_perf, des_perf, cap_perf;
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
+ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+
+ if (!policy)
+ return;
+@@ -687,8 +694,8 @@ static void amd_pstate_adjust_perf(unsig
+ if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+ amd_pstate_update_min_max_limit(policy);
+
+- cap_perf = READ_ONCE(cpudata->highest_perf);
+- min_limit_perf = READ_ONCE(cpudata->min_limit_perf);
++ perf = READ_ONCE(cpudata->perf);
++ cap_perf = perf.highest_perf;
+
+ des_perf = cap_perf;
+ if (target_perf < capacity)
+@@ -699,10 +706,10 @@ static void amd_pstate_adjust_perf(unsig
+ else
+ min_perf = cap_perf;
+
+- if (min_perf < min_limit_perf)
+- min_perf = min_limit_perf;
++ if (min_perf < perf.min_limit_perf)
++ min_perf = perf.min_limit_perf;
+
+- max_perf = cpudata->max_limit_perf;
++ max_perf = perf.max_limit_perf;
+ if (max_perf < min_perf)
+ max_perf = min_perf;
+
+@@ -713,11 +720,12 @@ static void amd_pstate_adjust_perf(unsig
+ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
+ {
+ struct amd_cpudata *cpudata = policy->driver_data;
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+ u32 nominal_freq, max_freq;
+ int ret = 0;
+
+ nominal_freq = READ_ONCE(cpudata->nominal_freq);
+- max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf));
++ max_freq = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf);
+
+ if (on)
+ policy->cpuinfo.max_freq = max_freq;
+@@ -882,30 +890,30 @@ static u32 amd_pstate_get_transition_lat
+ }
+
+ /*
+- * amd_pstate_init_freq: Initialize the max_freq, min_freq,
+- * nominal_freq and lowest_nonlinear_freq for
+- * the @cpudata object.
++ * amd_pstate_init_freq: Initialize the nominal_freq and lowest_nonlinear_freq
++ * for the @cpudata object.
+ *
+- * Requires: highest_perf, lowest_perf, nominal_perf and
+- * lowest_nonlinear_perf members of @cpudata to be
+- * initialized.
++ * Requires: all perf members of @cpudata to be initialized.
+ *
+- * Returns 0 on success, non-zero value on failure.
++ * Returns 0 on success, non-zero value on failure.
+ */
+ static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
+ {
+- int ret;
+- u32 min_freq, max_freq;
+- u32 nominal_freq, lowest_nonlinear_freq;
++ u32 min_freq, max_freq, nominal_freq, lowest_nonlinear_freq;
+ struct cppc_perf_caps cppc_perf;
++ union perf_cached perf;
++ int ret;
+
+ ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
+ if (ret)
+ return ret;
++ perf = READ_ONCE(cpudata->perf);
+
+- if (quirks && quirks->lowest_freq)
++ if (quirks && quirks->lowest_freq) {
+ min_freq = quirks->lowest_freq;
+- else
++ perf.lowest_perf = freq_to_perf(perf, nominal_freq, min_freq);
++ WRITE_ONCE(cpudata->perf, perf);
++ } else
+ min_freq = cppc_perf.lowest_freq;
+
+ if (quirks && quirks->nominal_freq)
+@@ -918,8 +926,8 @@ static int amd_pstate_init_freq(struct a
+
+ WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
+
+- max_freq = perf_to_freq(cpudata, cpudata->highest_perf);
+- lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf);
++ max_freq = perf_to_freq(perf, nominal_freq, perf.highest_perf);
++ lowest_nonlinear_freq = perf_to_freq(perf, nominal_freq, perf.lowest_nonlinear_perf);
+ WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
+
+ /**
+@@ -946,6 +954,7 @@ static int amd_pstate_init_freq(struct a
+ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
+ {
+ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+ struct device *dev;
+ int ret;
+
+@@ -981,8 +990,14 @@ static int amd_pstate_cpu_init(struct cp
+ policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu);
+ policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu);
+
+- policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
+- policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
++ perf = READ_ONCE(cpudata->perf);
++
++ policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf,
++ cpudata->nominal_freq,
++ perf.lowest_perf);
++ policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf,
++ cpudata->nominal_freq,
++ perf.highest_perf);
+
+ policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
+
+@@ -1063,23 +1078,27 @@ static int amd_pstate_cpu_suspend(struct
+ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy,
+ char *buf)
+ {
+- struct amd_cpudata *cpudata = policy->driver_data;
++ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+
++ cpudata = policy->driver_data;
++ perf = READ_ONCE(cpudata->perf);
+
+- return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)));
++ return sysfs_emit(buf, "%u\n",
++ perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf));
+ }
+
+ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy,
+ char *buf)
+ {
+- int freq;
+- struct amd_cpudata *cpudata = policy->driver_data;
++ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+
+- freq = READ_ONCE(cpudata->lowest_nonlinear_freq);
+- if (freq < 0)
+- return freq;
++ cpudata = policy->driver_data;
++ perf = READ_ONCE(cpudata->perf);
+
+- return sysfs_emit(buf, "%u\n", freq);
++ return sysfs_emit(buf, "%u\n",
++ perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_nonlinear_perf));
+ }
+
+ /*
+@@ -1089,12 +1108,11 @@ static ssize_t show_amd_pstate_lowest_no
+ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
+ char *buf)
+ {
+- u8 perf;
+- struct amd_cpudata *cpudata = policy->driver_data;
++ struct amd_cpudata *cpudata;
+
+- perf = READ_ONCE(cpudata->highest_perf);
++ cpudata = policy->driver_data;
+
+- return sysfs_emit(buf, "%u\n", perf);
++ return sysfs_emit(buf, "%u\n", cpudata->perf.highest_perf);
+ }
+
+ static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy,
+@@ -1425,6 +1443,7 @@ static bool amd_pstate_acpi_pm_profile_u
+ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
+ {
+ struct amd_cpudata *cpudata;
++ union perf_cached perf;
+ struct device *dev;
+ u64 value;
+ int ret;
+@@ -1458,8 +1477,15 @@ static int amd_pstate_epp_cpu_init(struc
+ if (ret)
+ goto free_cpudata1;
+
+- policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
+- policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
++ perf = READ_ONCE(cpudata->perf);
++
++ policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf,
++ cpudata->nominal_freq,
++ perf.lowest_perf);
++ policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf,
++ cpudata->nominal_freq,
++ perf.highest_perf);
++
+ /* It will be updated by governor */
+ policy->cur = policy->cpuinfo.min_freq;
+
+@@ -1520,6 +1546,7 @@ static void amd_pstate_epp_cpu_exit(stru
+ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
+ {
+ struct amd_cpudata *cpudata = policy->driver_data;
++ union perf_cached perf;
+ u8 epp;
+
+ if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
+@@ -1530,15 +1557,16 @@ static int amd_pstate_epp_update_limit(s
+ else
+ epp = READ_ONCE(cpudata->epp_cached);
+
++ perf = READ_ONCE(cpudata->perf);
+ if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp,
+- cpudata->min_limit_perf,
+- cpudata->max_limit_perf,
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp,
++ perf.min_limit_perf,
++ perf.max_limit_perf,
+ policy->boost_enabled);
+ }
+
+- return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
+- cpudata->max_limit_perf, epp, false);
++ return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U,
++ perf.max_limit_perf, epp, false);
+ }
+
+ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
+@@ -1570,20 +1598,18 @@ static int amd_pstate_epp_set_policy(str
+ static int amd_pstate_epp_reenable(struct cpufreq_policy *policy)
+ {
+ struct amd_cpudata *cpudata = policy->driver_data;
+- u8 max_perf;
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+ int ret;
+
+ ret = amd_pstate_cppc_enable(true);
+ if (ret)
+ pr_err("failed to enable amd pstate during resume, return %d\n", ret);
+
+- max_perf = READ_ONCE(cpudata->highest_perf);
+-
+ if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+ cpudata->epp_cached,
+ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
+- max_perf, policy->boost_enabled);
++ perf.highest_perf, policy->boost_enabled);
+ }
+
+ return amd_pstate_epp_update_limit(policy);
+@@ -1607,22 +1633,21 @@ static int amd_pstate_epp_cpu_online(str
+ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
+ {
+ struct amd_cpudata *cpudata = policy->driver_data;
+- u8 min_perf;
++ union perf_cached perf = READ_ONCE(cpudata->perf);
+
+ if (cpudata->suspended)
+ return 0;
+
+- min_perf = READ_ONCE(cpudata->lowest_perf);
+-
+ guard(mutex)(&amd_pstate_limits_lock);
+
+ if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE,
+- min_perf, min_perf, policy->boost_enabled);
++ perf.lowest_perf, perf.lowest_perf,
++ policy->boost_enabled);
+ }
+
+- return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf,
++ return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
+ }
+
+--- a/drivers/cpufreq/amd-pstate.h
++++ b/drivers/cpufreq/amd-pstate.h
+@@ -13,6 +13,36 @@
+ /*********************************************************************
+ * AMD P-state INTERFACE *
+ *********************************************************************/
++
++/**
++ * union perf_cached - A union to cache performance-related data.
++ * @highest_perf: the maximum performance an individual processor may reach,
++ * assuming ideal conditions
++ * For platforms that support the preferred core feature, the highest_perf value maybe
++ * configured to any value in the range 166-255 by the firmware (because the preferred
++ * core ranking is encoded in the highest_perf value). To maintain consistency across
++ * all platforms, we split the highest_perf and preferred core ranking values into
++ * cpudata->perf.highest_perf and cpudata->prefcore_ranking.
++ * @nominal_perf: the maximum sustained performance level of the processor,
++ * assuming ideal operating conditions
++ * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
++ * savings are achieved
++ * @lowest_perf: the absolute lowest performance level of the processor
++ * @min_limit_perf: Cached value of the performance corresponding to policy->min
++ * @max_limit_perf: Cached value of the performance corresponding to policy->max
++ */
++union perf_cached {
++ struct {
++ u8 highest_perf;
++ u8 nominal_perf;
++ u8 lowest_nonlinear_perf;
++ u8 lowest_perf;
++ u8 min_limit_perf;
++ u8 max_limit_perf;
++ };
++ u64 val;
++};
++
+ /**
+ * struct amd_aperf_mperf
+ * @aperf: actual performance frequency clock count
+@@ -30,20 +60,9 @@ struct amd_aperf_mperf {
+ * @cpu: CPU number
+ * @req: constraint request to apply
+ * @cppc_req_cached: cached performance request hints
+- * @highest_perf: the maximum performance an individual processor may reach,
+- * assuming ideal conditions
+- * For platforms that do not support the preferred core feature, the
+- * highest_pef may be configured with 166 or 255, to avoid max frequency
+- * calculated wrongly. we take the fixed value as the highest_perf.
+- * @nominal_perf: the maximum sustained performance level of the processor,
+- * assuming ideal operating conditions
+- * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
+- * savings are achieved
+- * @lowest_perf: the absolute lowest performance level of the processor
++ * @perf: cached performance-related data
+ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher
+ * priority.
+- * @min_limit_perf: Cached value of the performance corresponding to policy->min
+- * @max_limit_perf: Cached value of the performance corresponding to policy->max
+ * @min_limit_freq: Cached value of policy->min (in khz)
+ * @max_limit_freq: Cached value of policy->max (in khz)
+ * @nominal_freq: the frequency (in khz) that mapped to nominal_perf
+@@ -68,13 +87,9 @@ struct amd_cpudata {
+ struct freq_qos_request req[2];
+ u64 cppc_req_cached;
+
+- u8 highest_perf;
+- u8 nominal_perf;
+- u8 lowest_nonlinear_perf;
+- u8 lowest_perf;
++ union perf_cached perf;
++
+ u8 prefcore_ranking;
+- u8 min_limit_perf;
+- u8 max_limit_perf;
+ u32 min_limit_freq;
+ u32 max_limit_freq;
+ u32 nominal_freq;
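The core idea of patch 0014 — packing the six u8 perf levels into a union that overlays a single u64, so one load or store moves all of them consistently — can be illustrated outside the kernel. A minimal standalone sketch (C11 atomics stand in for the kernel's READ_ONCE()/WRITE_ONCE(); the field layout mirrors the union added above, and the sample values are made up):

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors perf_cached: six u8 levels share storage with one u64, so a
 * single 64-bit access publishes or snapshots every field at once. */
union perf_cached {
	struct {
		uint8_t highest_perf;
		uint8_t nominal_perf;
		uint8_t lowest_nonlinear_perf;
		uint8_t lowest_perf;
		uint8_t min_limit_perf;
		uint8_t max_limit_perf;
	};
	uint64_t val;
};

static _Atomic uint64_t cached;	/* stand-in for cpudata->perf */

int main(void)
{
	union perf_cached perf = { .highest_perf = 228, .nominal_perf = 160,
				   .lowest_nonlinear_perf = 90, .lowest_perf = 25 };

	/* writer: one atomic store publishes every field together */
	atomic_store_explicit(&cached, perf.val, memory_order_relaxed);

	/* reader: one atomic load gets a consistent snapshot, no locking */
	union perf_cached snap = { .val = atomic_load_explicit(&cached, memory_order_relaxed) };
	printf("highest=%u nominal=%u\n", snap.highest_perf, snap.nominal_perf);
	return 0;
}
```

Because a whole snapshot travels in one access, the next patch in the series can drop locks that existed only to keep these fields mutually consistent.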
diff --git a/debian/patches/patchset-pf/amd-pstate/0015-cpufreq-amd-pstate-Overhaul-locking.patch b/debian/patches/patchset-pf/amd-pstate/0015-cpufreq-amd-pstate-Overhaul-locking.patch
new file mode 100644
index 0000000..9fcd898
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0015-cpufreq-amd-pstate-Overhaul-locking.patch
@@ -0,0 +1,89 @@
+From 6c0b59640cce68d7574078d7d1e549bdb8f0128d Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:20 -0600
+Subject: cpufreq/amd-pstate: Overhaul locking
+
+amd_pstate_cpu_boost_update() and refresh_frequency_limits() both
+update the policy state and have nothing to do with the amd-pstate
+driver itself.
+
+A global "limits" lock doesn't make sense because each CPU can have
+policies changed independently. Each time a CPU changes values they
+will atomically be written to the per-CPU perf member. Drop per CPU
+locking cases.
+
+The remaining "global" driver lock is used to ensure that only one
+entity can change driver modes at a given time.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 13 +++----------
+ 1 file changed, 3 insertions(+), 10 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -196,7 +196,6 @@ static inline int get_mode_idx_from_str(
+ return -EINVAL;
+ }
+
+-static DEFINE_MUTEX(amd_pstate_limits_lock);
+ static DEFINE_MUTEX(amd_pstate_driver_lock);
+
+ static u8 msr_get_epp(struct amd_cpudata *cpudata)
+@@ -752,7 +751,6 @@ static int amd_pstate_set_boost(struct c
+ pr_err("Boost mode is not supported by this processor or SBIOS\n");
+ return -EOPNOTSUPP;
+ }
+- guard(mutex)(&amd_pstate_driver_lock);
+
+ ret = amd_pstate_cpu_boost_update(policy, state);
+ refresh_frequency_limits(policy);
+@@ -1170,8 +1168,6 @@ static ssize_t store_energy_performance_
+ if (ret < 0)
+ return -EINVAL;
+
+- guard(mutex)(&amd_pstate_limits_lock);
+-
+ ret = amd_pstate_set_energy_pref_index(policy, ret);
+
+ return ret ? ret : count;
+@@ -1344,8 +1340,10 @@ int amd_pstate_update_status(const char
+ if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX)
+ return -EINVAL;
+
+- if (mode_state_machine[cppc_state][mode_idx])
++ if (mode_state_machine[cppc_state][mode_idx]) {
++ guard(mutex)(&amd_pstate_driver_lock);
+ return mode_state_machine[cppc_state][mode_idx](mode_idx);
++ }
+
+ return 0;
+ }
+@@ -1366,7 +1364,6 @@ static ssize_t status_store(struct devic
+ char *p = memchr(buf, '\n', count);
+ int ret;
+
+- guard(mutex)(&amd_pstate_driver_lock);
+ ret = amd_pstate_update_status(buf, p ? p - buf : count);
+
+ return ret < 0 ? ret : count;
+@@ -1638,8 +1635,6 @@ static int amd_pstate_epp_cpu_offline(st
+ if (cpudata->suspended)
+ return 0;
+
+- guard(mutex)(&amd_pstate_limits_lock);
+-
+ if (trace_amd_pstate_epp_perf_enabled()) {
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE,
+@@ -1679,8 +1674,6 @@ static int amd_pstate_epp_resume(struct
+ struct amd_cpudata *cpudata = policy->driver_data;
+
+ if (cpudata->suspended) {
+- guard(mutex)(&amd_pstate_limits_lock);
+-
+ /* enable amd pstate from suspend state*/
+ amd_pstate_epp_reenable(policy);
+
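The scoped lock that patch 0015 keeps for mode transitions is the kernel's guard(mutex)() from <linux/cleanup.h>, which drops the lock automatically when the guard goes out of scope. A rough userspace approximation of the idiom, under stated assumptions (pthreads plus the GCC/Clang cleanup attribute; scoped_guard_mutex() and set_driver_mode() are invented names for illustration, not driver API):

```c
#include <pthread.h>

/* Userspace sketch of guard(mutex)(...): the lock is released on every
 * return path, including early error returns, with no unlock calls. */
static void mutex_unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}
#define scoped_guard_mutex(m) \
	pthread_mutex_t *guard__ __attribute__((cleanup(mutex_unlock_cleanup))) = \
		(pthread_mutex_lock(m), (m))

static pthread_mutex_t driver_lock = PTHREAD_MUTEX_INITIALIZER;
static int mode;

/* Only mode transitions still take the global lock; per-CPU limit
 * updates no longer need it because the perf union is written atomically. */
static int set_driver_mode(int new_mode)
{
	scoped_guard_mutex(&driver_lock);
	if (new_mode < 0)
		return -1;	/* early return still unlocks */
	mode = new_mode;
	return 0;
}

int main(void) { return set_driver_mode(1); }
```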
diff --git a/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Drop-cppc_cap1_cached.patch b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Drop-cppc_cap1_cached.patch
new file mode 100644
index 0000000..b41b670
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Drop-cppc_cap1_cached.patch
@@ -0,0 +1,48 @@
+From 7c9409faeb921c76988b4cd2294ca0a959775f35 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:21 -0600
+Subject: cpufreq/amd-pstate: Drop `cppc_cap1_cached`
+
+The `cppc_cap1_cached` variable isn't used at all, so there is no
+need to read it at initialization for each CPU.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 5 -----
+ drivers/cpufreq/amd-pstate.h | 2 --
+ 2 files changed, 7 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -1508,11 +1508,6 @@ static int amd_pstate_epp_cpu_init(struc
+ if (ret)
+ return ret;
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
+-
+- ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value);
+- if (ret)
+- return ret;
+- WRITE_ONCE(cpudata->cppc_cap1_cached, value);
+ }
+ ret = amd_pstate_set_epp(cpudata, cpudata->epp_default);
+ if (ret)
+--- a/drivers/cpufreq/amd-pstate.h
++++ b/drivers/cpufreq/amd-pstate.h
+@@ -76,7 +76,6 @@ struct amd_aperf_mperf {
+ * AMD P-State driver supports preferred core featue.
+ * @epp_cached: Cached CPPC energy-performance preference value
+ * @policy: Cpufreq policy value
+- * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value
+ *
+ * The amd_cpudata is key private data for each CPU thread in AMD P-State, and
+ * represents all the attributes and goals that AMD P-State requests at runtime.
+@@ -105,7 +104,6 @@ struct amd_cpudata {
+ /* EPP feature related attributes*/
+ u8 epp_cached;
+ u32 policy;
+- u64 cppc_cap1_cached;
+ bool suspended;
+ u8 epp_default;
+ };
diff --git a/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Use-_free-macro-to-free-put-po.patch b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Use-_free-macro-to-free-put-po.patch
new file mode 100644
index 0000000..4af0d8e
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Use-_free-macro-to-free-put-po.patch
@@ -0,0 +1,144 @@
+From 346b2824b742a8f5943db8c8200ba4a7492bb3cf Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:22 -0600
+Subject: cpufreq/amd-pstate-ut: Use _free macro to free put policy
+
+Using a scoped cleanup macro simplifies cleanup code.
+
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 33 ++++++++++++++-------------------
+ 1 file changed, 14 insertions(+), 19 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -26,6 +26,7 @@
+ #include <linux/module.h>
+ #include <linux/moduleparam.h>
+ #include <linux/fs.h>
++#include <linux/cleanup.h>
+
+ #include <acpi/cppc_acpi.h>
+
+@@ -127,11 +128,12 @@ static void amd_pstate_ut_check_perf(u32
+ u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
+ u64 cap1 = 0;
+ struct cppc_perf_caps cppc_perf;
+- struct cpufreq_policy *policy = NULL;
+ struct amd_cpudata *cpudata = NULL;
+ union perf_cached cur_perf;
+
+ for_each_possible_cpu(cpu) {
++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
++
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+ break;
+@@ -142,7 +144,7 @@ static void amd_pstate_ut_check_perf(u32
+ if (ret) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret);
+- goto skip_test;
++ return;
+ }
+
+ highest_perf = cppc_perf.highest_perf;
+@@ -154,7 +156,7 @@ static void amd_pstate_ut_check_perf(u32
+ if (ret) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret);
+- goto skip_test;
++ return;
+ }
+
+ highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
+@@ -167,7 +169,7 @@ static void amd_pstate_ut_check_perf(u32
+ if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
+ pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
+ __func__, cpu, highest_perf, cur_perf.highest_perf);
+- goto skip_test;
++ return;
+ }
+ if (nominal_perf != cur_perf.nominal_perf ||
+ (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
+@@ -177,7 +179,7 @@ static void amd_pstate_ut_check_perf(u32
+ __func__, cpu, nominal_perf, cur_perf.nominal_perf,
+ lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
+ lowest_perf, cur_perf.lowest_perf);
+- goto skip_test;
++ return;
+ }
+
+ if (!((highest_perf >= nominal_perf) &&
+@@ -188,15 +190,11 @@ static void amd_pstate_ut_check_perf(u32
+ pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
+ __func__, cpu, highest_perf, nominal_perf,
+ lowest_nonlinear_perf, lowest_perf);
+- goto skip_test;
++ return;
+ }
+- cpufreq_cpu_put(policy);
+ }
+
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- return;
+-skip_test:
+- cpufreq_cpu_put(policy);
+ }
+
+ /*
+@@ -207,10 +205,11 @@ skip_test:
+ static void amd_pstate_ut_check_freq(u32 index)
+ {
+ int cpu = 0;
+- struct cpufreq_policy *policy = NULL;
+ struct amd_cpudata *cpudata = NULL;
+
+ for_each_possible_cpu(cpu) {
++ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
++
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+ break;
+@@ -224,14 +223,14 @@ static void amd_pstate_ut_check_freq(u32
+ pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
+ __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq,
+ cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq);
+- goto skip_test;
++ return;
+ }
+
+ if (cpudata->lowest_nonlinear_freq != policy->min) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
+ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
+- goto skip_test;
++ return;
+ }
+
+ if (cpudata->boost_supported) {
+@@ -243,20 +242,16 @@ static void amd_pstate_ut_check_freq(u32
+ pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
+ __func__, cpu, policy->max, policy->cpuinfo.max_freq,
+ cpudata->nominal_freq);
+- goto skip_test;
++ return;
+ }
+ } else {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d must support boost!\n", __func__, cpu);
+- goto skip_test;
++ return;
+ }
+- cpufreq_cpu_put(policy);
+ }
+
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- return;
+-skip_test:
+- cpufreq_cpu_put(policy);
+ }
+
+ static int amd_pstate_set_mode(enum amd_pstate_mode mode)
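The __free(put_cpufreq_policy) annotation that patch 0017 introduces is the same cleanup.h mechanism applied to reference counting: every exit from the loop body drops the policy reference, so the skip_test label and the manual cpufreq_cpu_put() calls can go away. A small userspace sketch of the idiom under stated assumptions (malloc/free stand in for cpufreq_cpu_get()/cpufreq_cpu_put(); check_one() is a made-up stand-in for the per-CPU test body):

```c
#include <stdio.h>
#include <stdlib.h>

/* Sketch of the kernel's __free() from <linux/cleanup.h>: the named
 * cleanup function runs automatically when the variable leaves scope. */
static void free_buf(char **p) { free(*p); }
#define __free_buf __attribute__((cleanup(free_buf)))

static int check_one(int cpu)
{
	char *policy __free_buf = malloc(32);	/* "acquire" the resource */

	if (!policy)
		return 0;	/* nothing acquired, nothing to drop */
	if (cpu % 2)
		return -1;	/* early return: cleanup still runs, no leak */
	snprintf(policy, 32, "cpu%d", cpu);
	return 0;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		check_one(cpu);	/* every path releases its resource */
	return 0;
}
```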
diff --git a/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-ut-Allow-lowest-nonlinear-and-low.patch b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-ut-Allow-lowest-nonlinear-and-low.patch
new file mode 100644
index 0000000..e3f4afb
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-ut-Allow-lowest-nonlinear-and-low.patch
@@ -0,0 +1,37 @@
+From 310f8a994f55561902e5a75ff8623988921e3908 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:23 -0600
+Subject: cpufreq/amd-pstate-ut: Allow lowest nonlinear and lowest to be the
+ same
+
+Several Ryzen AI processors support the exact same value for lowest
+nonlinear perf and lowest perf. Loosen up the unit tests to allow this
+scenario.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -184,7 +184,7 @@ static void amd_pstate_ut_check_perf(u32
+
+ if (!((highest_perf >= nominal_perf) &&
+ (nominal_perf > lowest_nonlinear_perf) &&
+- (lowest_nonlinear_perf > lowest_perf) &&
++ (lowest_nonlinear_perf >= lowest_perf) &&
+ (lowest_perf > 0))) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
+ __func__, cpu, highest_perf, nominal_perf,
+ lowest_nonlinear_perf, lowest_perf);
+@@ -217,7 +217,7 @@ static void amd_pstate_ut_check_freq(u32
+
+ if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) &&
+ (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
+- (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) &&
++ (cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) &&
+ (policy->cpuinfo.min_freq > 0))) {
+ amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
diff --git a/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-amd-pstate-ut-Drop-SUCCESS-and-FAIL-enums.patch b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-amd-pstate-ut-Drop-SUCCESS-and-FAIL-enums.patch
new file mode 100644
index 0000000..264b38d
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0019-cpufreq-amd-pstate-ut-Drop-SUCCESS-and-FAIL-enums.patch
@@ -0,0 +1,309 @@
+From bc4a683dbfcc306851bbfec33f9c857c523d4848 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:24 -0600
+Subject: cpufreq/amd-pstate-ut: Drop SUCCESS and FAIL enums
+
+Enums are effectively used as booleans and don't show
+the return value of the failing call.
+
+Instead of using enums, switch to returning the actual return
+code from the unit test.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 143 ++++++++++++--------------------
+ 1 file changed, 55 insertions(+), 88 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -32,30 +32,20 @@
+
+ #include "amd-pstate.h"
+
+-/*
+- * Abbreviations:
+- * amd_pstate_ut: used as a shortform for AMD P-State unit test.
+- * It helps to keep variable names smaller, simpler
+- */
+-enum amd_pstate_ut_result {
+- AMD_PSTATE_UT_RESULT_PASS,
+- AMD_PSTATE_UT_RESULT_FAIL,
+-};
+
+ struct amd_pstate_ut_struct {
+ const char *name;
+- void (*func)(u32 index);
+- enum amd_pstate_ut_result result;
++ int (*func)(u32 index);
+ };
+
+ /*
+ * Kernel module for testing the AMD P-State unit test
+ */
+-static void amd_pstate_ut_acpi_cpc_valid(u32 index);
+-static void amd_pstate_ut_check_enabled(u32 index);
+-static void amd_pstate_ut_check_perf(u32 index);
+-static void amd_pstate_ut_check_freq(u32 index);
+-static void amd_pstate_ut_check_driver(u32 index);
++static int amd_pstate_ut_acpi_cpc_valid(u32 index);
++static int amd_pstate_ut_check_enabled(u32 index);
++static int amd_pstate_ut_check_perf(u32 index);
++static int amd_pstate_ut_check_freq(u32 index);
++static int amd_pstate_ut_check_driver(u32 index);
+
+ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
+ {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid },
+@@ -78,51 +68,46 @@ static bool get_shared_mem(void)
+ /*
+ * check the _CPC object is present in SBIOS.
+ */
+-static void amd_pstate_ut_acpi_cpc_valid(u32 index)
++static int amd_pstate_ut_acpi_cpc_valid(u32 index)
+ {
+- if (acpi_cpc_valid())
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- else {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
++ if (!acpi_cpc_valid()) {
+ pr_err("%s the _CPC object is not present in SBIOS!\n", __func__);
++ return -EINVAL;
+ }
++
++ return 0;
+ }
+
+-static void amd_pstate_ut_pstate_enable(u32 index)
++/*
++ * check if amd pstate is enabled
++ */
++static int amd_pstate_ut_check_enabled(u32 index)
+ {
+- int ret = 0;
+ u64 cppc_enable = 0;
++ int ret;
++
++ if (get_shared_mem())
++ return 0;
+
+ ret = rdmsrl_safe(MSR_AMD_CPPC_ENABLE, &cppc_enable);
+ if (ret) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s rdmsrl_safe MSR_AMD_CPPC_ENABLE ret=%d error!\n", __func__, ret);
+- return;
++ return ret;
+ }
+- if (cppc_enable)
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- else {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
++
++ if (!cppc_enable) {
+ pr_err("%s amd pstate must be enabled!\n", __func__);
++ return -EINVAL;
+ }
+-}
+
+-/*
+- * check if amd pstate is enabled
+- */
+-static void amd_pstate_ut_check_enabled(u32 index)
+-{
+- if (get_shared_mem())
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- else
+- amd_pstate_ut_pstate_enable(index);
++ return 0;
+ }
+
+ /*
+ * check if performance values are reasonable.
+ * highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0
+ */
+-static void amd_pstate_ut_check_perf(u32 index)
++static int amd_pstate_ut_check_perf(u32 index)
+ {
+ int cpu = 0, ret = 0;
+ u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
+ u64 cap1 = 0;
+ struct cppc_perf_caps cppc_perf;
+@@ -142,9 +127,8 @@ static void amd_pstate_ut_check_perf(u32
+ if (get_shared_mem()) {
+ ret = cppc_get_perf_caps(cpu, &cppc_perf);
+ if (ret) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret);
+- return;
++ return ret;
+ }
+
+ highest_perf = cppc_perf.highest_perf;
+@@ -154,9 +138,8 @@ static void amd_pstate_ut_check_perf(u32
+ } else {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
+ if (ret) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret);
+- return;
++ return ret;
+ }
+
+ highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
+@@ -169,32 +152,30 @@ static void amd_pstate_ut_check_perf(u32
+ if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
+ pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
+ __func__, cpu, highest_perf, cur_perf.highest_perf);
+- return;
++ return -EINVAL;
+ }
+ if (nominal_perf != cur_perf.nominal_perf ||
+ (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
+ (lowest_perf != cur_perf.lowest_perf)) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n",
+ __func__, cpu, nominal_perf, cur_perf.nominal_perf,
+ lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
+ lowest_perf, cur_perf.lowest_perf);
+- return;
++ return -EINVAL;
+ }
+
+ if (!((highest_perf >= nominal_perf) &&
+ (nominal_perf > lowest_nonlinear_perf) &&
+ (lowest_nonlinear_perf >= lowest_perf) &&
+ (lowest_perf > 0))) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
+ __func__, cpu, highest_perf, nominal_perf,
+ lowest_nonlinear_perf, lowest_perf);
+- return;
++ return -EINVAL;
+ }
+ }
+
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
++ return 0;
+ }
+
+ /*
+@@ -202,7 +183,7 @@ static void amd_pstate_ut_check_perf(u32
+ * max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0
+ * check max freq when set support boost mode.
+ */
+-static void amd_pstate_ut_check_freq(u32 index)
++static int amd_pstate_ut_check_freq(u32 index)
+ {
+ int cpu = 0;
+ struct amd_cpudata *cpudata = NULL;
+@@ -219,39 +200,33 @@ static void amd_pstate_ut_check_freq(u32
+ (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
+ (cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) &&
+ (policy->cpuinfo.min_freq > 0))) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
+ __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq,
+ cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq);
+- return;
++ return -EINVAL;
+ }
+
+ if (cpudata->lowest_nonlinear_freq != policy->min) {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
+ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
+- return;
++ return -EINVAL;
+ }
+
+ if (cpudata->boost_supported) {
+- if ((policy->max == policy->cpuinfo.max_freq) ||
+- (policy->max == cpudata->nominal_freq))
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
+- else {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
++ if ((policy->max != policy->cpuinfo.max_freq) &&
++ (policy->max != cpudata->nominal_freq)) {
+ pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
+ __func__, cpu, policy->max, policy->cpuinfo.max_freq,
+ cpudata->nominal_freq);
+- return;
++ return -EINVAL;
+ }
+ } else {
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
+ pr_err("%s cpu%d must support boost!\n", __func__, cpu);
+- return;
++ return -EINVAL;
+ }
+ }
+
+- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
++ return 0;
+ }
+
+ static int amd_pstate_set_mode(enum amd_pstate_mode mode)
+@@ -263,32 +238,28 @@ static int amd_pstate_set_mode(enum amd_
+ return amd_pstate_update_status(mode_str, strlen(mode_str));
+ }
+
+-static void amd_pstate_ut_check_driver(u32 index)
++static int amd_pstate_ut_check_driver(u32 index)
+ {
+ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
+- int ret;
+
+ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
+- ret = amd_pstate_set_mode(mode1);
++ int ret = amd_pstate_set_mode(mode1);
+ if (ret)
+- goto out;
++ return ret;
+ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
+ if (mode1 == mode2)
+ continue;
+ ret = amd_pstate_set_mode(mode2);
+- if (ret)
+- goto out;
++ if (ret) {
++ pr_err("%s: failed to update status for %s->%s\n", __func__,
++ amd_pstate_get_mode_string(mode1),
++ amd_pstate_get_mode_string(mode2));
++ return ret;
++ }
+ }
+ }
+-out:
+- if (ret)
+- pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
+- amd_pstate_get_mode_string(mode1),
+- amd_pstate_get_mode_string(mode2), ret);
+-
+- amd_pstate_ut_cases[index].result = ret ?
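The shape of the 0019 refactor is easy to see in isolation: each test returns 0 or a negative errno instead of recording a pass/fail enum, so the runner can report why a case failed. A compact standalone sketch under those assumptions (the test names and runner loop are invented; the printf formats mirror the ones in the patch):

```c
#include <errno.h>
#include <stdio.h>

/* Tests report an actual error code, not an opaque pass/fail enum. */
static int test_ok(void)  { return 0; }
static int test_bad(void) { return -EINVAL; }

static struct { const char *name; int (*fn)(void); } cases[] = {
	{ "test_ok",  test_ok },
	{ "test_bad", test_bad },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		int ret = cases[i].fn();

		if (ret)	/* the failing code is now visible */
			printf("%-4u %-20s\t fail: %d!\n", i + 1, cases[i].name, ret);
		else
			printf("%-4u %-20s\t success!\n", i + 1, cases[i].name);
	}
	return 0;
}
```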
+- AMD_PSTATE_UT_RESULT_FAIL :
+- AMD_PSTATE_UT_RESULT_PASS;
++
++ return 0;
+ }
+
+ static int __init amd_pstate_ut_init(void)
+@@ -296,16 +267,12 @@ static int __init amd_pstate_ut_init(voi
+ u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);
+
+ for (i = 0; i < arr_size; i++) {
+- amd_pstate_ut_cases[i].func(i);
+- switch (amd_pstate_ut_cases[i].result) {
+- case AMD_PSTATE_UT_RESULT_PASS:
++ int ret = amd_pstate_ut_cases[i].func(i);
++
++ if (ret)
++ pr_err("%-4d %-20s\t fail: %d!\n", i+1, amd_pstate_ut_cases[i].name, ret);
++ else
+ pr_info("%-4d %-20s\t success!\n", i+1, amd_pstate_ut_cases[i].name);
+- break;
+- case AMD_PSTATE_UT_RESULT_FAIL:
+- default:
+- pr_info("%-4d %-20s\t fail!\n", i+1, amd_pstate_ut_cases[i].name);
+- break;
+- }
+ }
+
+ return 0;
diff --git a/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-ut-Run-on-all-of-the-correct-CPUs.patch b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-ut-Run-on-all-of-the-correct-CPUs.patch
new file mode 100644
index 0000000..37eaa37
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-ut-Run-on-all-of-the-correct-CPUs.patch
@@ -0,0 +1,50 @@
+From 3651a3bd2d07f627d5382ec9e9b980c689d0eb98 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:25 -0600
+Subject: cpufreq/amd-pstate-ut: Run on all of the correct CPUs
+
+If a CPU is missing a policy or one has been offlined then the unit test
+is skipped for the rest of the CPUs on the system.
+
+Instead, iterate online CPUs and skip any missing policies to allow
+continuing to test the rest of them.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -116,12 +116,12 @@ static int amd_pstate_ut_check_perf(u32
+ struct amd_cpudata *cpudata = NULL;
+ union perf_cached cur_perf;
+
+- for_each_possible_cpu(cpu) {
++ for_each_online_cpu(cpu) {
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+- break;
++ continue;
+ cpudata = policy->driver_data;
+
+ if (get_shared_mem()) {
+@@ -188,12 +188,12 @@ static int amd_pstate_ut_check_freq(u32
+ int cpu = 0;
+ struct amd_cpudata *cpudata = NULL;
+
+- for_each_possible_cpu(cpu) {
++ for_each_online_cpu(cpu) {
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+- break;
++ continue;
+ cpudata = policy->driver_data;
+
+ if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) &&
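The behavioural point of patch 0020 is just break versus continue: with break, one missing policy hides every later CPU from the test. A tiny illustration (the policies array is a made-up stand-in for cpufreq_cpu_get() returning NULL for an offlined CPU):

```c
#include <stdio.h>

static const char *policies[] = { "cpu0", NULL /* offlined */, "cpu2" };

int main(void)
{
	for (unsigned cpu = 0; cpu < 3; cpu++) {
		if (!policies[cpu])
			continue;	/* was: break; cpu2 would never be tested */
		printf("checking %s\n", policies[cpu]);
	}
	return 0;
}
```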
diff --git a/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-ut-Adjust-variable-scope.patch b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-ut-Adjust-variable-scope.patch
new file mode 100644
index 0000000..4c5f051
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-ut-Adjust-variable-scope.patch
@@ -0,0 +1,42 @@
+From 4ec612c9d5de9620b8f0ad4463db5d08c2d68222 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:26 -0600
+Subject: cpufreq/amd-pstate-ut: Adjust variable scope
+
+In amd_pstate_ut_check_freq() and amd_pstate_ut_check_perf() the cpudata
+variable is only needed in the scope of the for loop. Move it there.
+
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-ut.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -113,11 +113,11 @@ static int amd_pstate_ut_check_perf(u32
+ u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
+ u64 cap1 = 0;
+ struct cppc_perf_caps cppc_perf;
+- struct amd_cpudata *cpudata = NULL;
+ union perf_cached cur_perf;
+
+ for_each_online_cpu(cpu) {
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
++ struct amd_cpudata *cpudata;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
+@@ -186,10 +186,10 @@ static int amd_pstate_ut_check_perf(u32
+ static int amd_pstate_ut_check_freq(u32 index)
+ {
+ int cpu = 0;
+- struct amd_cpudata *cpudata = NULL;
+
+ for_each_online_cpu(cpu) {
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
++ struct amd_cpudata *cpudata;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy)
diff --git a/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch
new file mode 100644
index 0000000..7af21f6
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch
@@ -0,0 +1,123 @@
+From 1512ed2a741a0df98972679da6177df4998fd8ce Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:27 -0600
+Subject: cpufreq/amd-pstate: Replace all AMD_CPPC_* macros with masks
+
+Bitfield masks are easier to follow and less error prone.
+
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ arch/x86/include/asm/msr-index.h | 20 +++++++++++---------
+ arch/x86/kernel/acpi/cppc.c | 4 +++-
+ drivers/cpufreq/amd-pstate-ut.c | 9 +++++----
+ drivers/cpufreq/amd-pstate.c | 16 ++++++----------
+ 4 files changed, 25 insertions(+), 24 deletions(-)
+
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -701,15 +701,17 @@
+ #define MSR_AMD_CPPC_REQ 0xc00102b3
+ #define MSR_AMD_CPPC_STATUS 0xc00102b4
+
+-#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
+-#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
+-#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
+-#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
++/* Masks for use with MSR_AMD_CPPC_CAP1 */
++#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0)
++#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8)
++#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16)
++#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24)
+
+-#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
+-#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
+-#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
+-#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
++/* Masks for use with MSR_AMD_CPPC_REQ */
++#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
++#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
++#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
++#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
+
+ /* AMD Performance Counter Global Status and Control MSRs */
+ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
+--- a/arch/x86/kernel/acpi/cppc.c
++++ b/arch/x86/kernel/acpi/cppc.c
+@@ -4,6 +4,8 @@
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
++#include <linux/bitfield.h>
++
+ #include <acpi/cppc_acpi.h>
+ #include <asm/msr.h>
+ #include <asm/processor.h>
+@@ -149,7 +151,7 @@ int amd_get_highest_perf(unsigned int cp
+ if (ret)
+ goto out;
+
+- val = AMD_CPPC_HIGHEST_PERF(val);
++ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+--- a/drivers/cpufreq/amd-pstate-ut.c
++++ b/drivers/cpufreq/amd-pstate-ut.c
+@@ -22,6 +22,7 @@
+
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
++#include <linux/bitfield.h>
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ #include <linux/moduleparam.h>
+@@ -142,10 +143,10 @@ static int amd_pstate_ut_check_perf(u32
+ return ret;
+ }
+
+- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
+- nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
+- lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
+- lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
++ highest_perf = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1);
++ nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1);
++ lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1);
++ lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
+ }
+
+ cur_perf = READ_ONCE(cpudata->perf);
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -89,11 +89,6 @@ static bool cppc_enabled;
+ static bool amd_pstate_prefcore = true;
+ static struct quirk_entry *quirks;
+
+-#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
+-#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
+-#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
+-#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
+-
+ /*
+ * AMD Energy Preference Performance (EPP)
+ * The EPP is used in the CCLK DPM controller to drive
+@@ -439,12 +434,13 @@ static int msr_init_perf(struct amd_cpud
+
+ perf.highest_perf = numerator;
+ perf.max_limit_perf = numerator;
+- perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1);
+- perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
+- perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
+- perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
++ perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
++ perf.nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1);
++ perf.lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1);
++ perf.lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
+ WRITE_ONCE(cpudata->perf, perf);
+- WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1));
++ WRITE_ONCE(cpudata->prefcore_ranking, FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1));
++
+ return 0;
+ }
+
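The GENMASK()/FIELD_GET()/FIELD_PREP() trio that patch 0022 switches to keeps each field's bit layout in one named mask instead of scattering shift-and-mask pairs. A standalone sketch with simplified stand-ins for the kernel macros from <linux/bits.h> and <linux/bitfield.h> (the real ones add compile-time type checks that are omitted here):

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins: GENMASK builds a contiguous mask, FIELD_PREP
 * shifts a value into the mask's position, FIELD_GET extracts it. */
#define GENMASK(h, l)    (((~0ULL) >> (63 - (h))) & ((~0ULL) << (l)))
#define FIELD_GET(m, v)  (((v) & (m)) / ((m) & ~((m) << 1)))
#define FIELD_PREP(m, v) (((uint64_t)(v) * ((m) & ~((m) << 1))) & (m))

#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16)

int main(void)
{
	uint64_t cap1 = FIELD_PREP(AMD_CPPC_NOMINAL_PERF_MASK, 160);

	/* round-trips without hand-written shifts: the mask carries the layout */
	printf("nominal=%llu\n",
	       (unsigned long long)FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1));
	return 0;
}
```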
diff --git a/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Cache-CPPC-request-in-shared-mem-.patch b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Cache-CPPC-request-in-shared-mem-.patch
new file mode 100644
index 0000000..505d80a
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Cache-CPPC-request-in-shared-mem-.patch
@@ -0,0 +1,60 @@
+From bf6e8073cc7f17d6be40e16a04b5a277d7217f39 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:28 -0600
+Subject: cpufreq/amd-pstate: Cache CPPC request in shared mem case too
+
+In order to prevent a potential write for shmem_update_perf(),
+cache the request into the cppc_req_cached variable normally only
+used for the MSR case.
+
+This adds symmetry into the code and potentially avoids extra writes.
+
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate.c | 22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -496,6 +496,8 @@ static int shmem_update_perf(struct amd_
+ u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
+ {
+ struct cppc_perf_ctrls perf_ctrls;
++ u64 value, prev;
++ int ret;
+
+ if (cppc_state == AMD_PSTATE_ACTIVE) {
+ int ret = shmem_set_epp(cpudata, epp);
+@@ -504,11 +506,29 @@ static int shmem_update_perf(struct amd_
+ return ret;
+ }
+
++ value = prev = READ_ONCE(cpudata->cppc_req_cached);
++
++ value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK |
++ AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK);
++ value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf);
++ value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf);
++ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
++
++ if (value == prev)
++ return 0;
++
+ perf_ctrls.max_perf = max_perf;
+ perf_ctrls.min_perf = min_perf;
+ perf_ctrls.desired_perf = des_perf;
+
+- return cppc_set_perf(cpudata->cpu, &perf_ctrls);
++ ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
++ if (ret)
++ return ret;
++
++ WRITE_ONCE(cpudata->cppc_req_cached, value);
++
++ return 0;
+ }
+
+ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
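The caching idea in patch 0023 reduces to: rebuild the would-be request, compare it against the cached copy, and only pay for the (potentially slow) platform write when something actually changed. A minimal sketch of that shape (update_perf() and the write counter are invented stand-ins; in the real driver the counted operation is cppc_set_perf()):

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t cppc_req_cached;	/* mirrors cpudata->cppc_req_cached */
static int platform_writes;

static int update_perf(uint64_t value)
{
	uint64_t prev = cppc_req_cached;

	if (value == prev)
		return 0;		/* duplicate request: no write issued */

	platform_writes++;		/* stands in for cppc_set_perf() */
	cppc_req_cached = value;	/* cache only after a successful write */
	return 0;
}

int main(void)
{
	update_perf(0x1020);
	update_perf(0x1020);		/* skipped thanks to the cache */
	printf("writes issued: %d\n", platform_writes);	/* prints 1 */
	return 0;
}
```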
diff --git a/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Move-all-EPP-tracing-into-_update.patch b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Move-all-EPP-tracing-into-_update.patch
new file mode 100644
index 0000000..24a45aa
--- /dev/null
+++ b/debian/patches/patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Move-all-EPP-tracing-into-_update.patch
@@ -0,0 +1,318 @@
+From 1a3ff33ff2fbe3ecc2d86addd115329fddb28ea1 Mon Sep 17 00:00:00 2001
+From: Mario Limonciello <mario.limonciello@amd.com>
+Date: Wed, 26 Feb 2025 01:49:29 -0600
+Subject: cpufreq/amd-pstate: Move all EPP tracing into *_update_perf and
+ *_set_epp functions
+
+The EPP tracing is done by the caller today, but this precludes including
+information about whether the CPPC request has changed.
+
+Move it into the update_perf and set_epp functions and include information
+about whether the request has changed from the last one.
+amd_pstate_update_perf() and amd_pstate_set_epp() now require the policy
+as an argument instead of the cpudata.
+
+Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
+Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+---
+ drivers/cpufreq/amd-pstate-trace.h | 13 +++-
+ drivers/cpufreq/amd-pstate.c | 118 +++++++++++++++++------------
+ 2 files changed, 80 insertions(+), 51 deletions(-)
+
+--- a/drivers/cpufreq/amd-pstate-trace.h
++++ b/drivers/cpufreq/amd-pstate-trace.h
+@@ -90,7 +90,8 @@ TRACE_EVENT(amd_pstate_epp_perf,
+ u8 epp,
+ u8 min_perf,
+ u8 max_perf,
+- bool boost
++ bool boost,
++ bool changed
+ ),
+
+ TP_ARGS(cpu_id,
+@@ -98,7 +99,8 @@ TRACE_EVENT(amd_pstate_epp_perf,
+ epp,
+ min_perf,
+ max_perf,
+- boost),
++ boost,
++ changed),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cpu_id)
+@@ -107,6 +109,7 @@ TRACE_EVENT(amd_pstate_epp_perf,
+ __field(u8, min_perf)
+ __field(u8, max_perf)
+ __field(bool, boost)
++ __field(bool, changed)
+ ),
+
+ TP_fast_assign(
+@@ -116,15 +119,17 @@ TRACE_EVENT(amd_pstate_epp_perf,
+ __entry->min_perf = min_perf;
+ __entry->max_perf = max_perf;
+ __entry->boost = boost;
++ __entry->changed = changed;
+ ),
+
+- TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u",
++ TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u, changed=%u",
+ (unsigned int)__entry->cpu_id,
+ (u8)__entry->min_perf,
+ (u8)__entry->max_perf,
+ (u8)__entry->highest_perf,
+ (u8)__entry->epp,
+- (bool)__entry->boost
++ (bool)__entry->boost,
++ (bool)__entry->changed
+ )
+ );
+
+--- a/drivers/cpufreq/amd-pstate.c
++++ b/drivers/cpufreq/amd-pstate.c
+@@ -228,9 +228,10 @@ static u8 shmem_get_epp(struct amd_cpuda
+ return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp);
+ }
+
+-static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf,
++static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf,
+ u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
+ {
++ struct amd_cpudata *cpudata = policy->driver_data;
+ u64 value, prev;
+
+ value = prev = READ_ONCE(cpudata->cppc_req_cached);
+@@ -242,6 +243,18 @@ static int msr_update_perf(struct amd_cp
+ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
+ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
+
++ if (trace_amd_pstate_epp_perf_enabled()) {
++ union perf_cached perf = READ_ONCE(cpudata->perf);
++
++ trace_amd_pstate_epp_perf(cpudata->cpu,
++ perf.highest_perf,
++ epp,
++ min_perf,
++ max_perf,
++ policy->boost_enabled,
++ value != prev);
++ }
++
+ if (value == prev)
+ return 0;
+
+@@ -256,24 +269,26 @@ static int msr_update_perf(struct amd_cp
+ }
+
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
+- WRITE_ONCE(cpudata->epp_cached, epp);
++ if (epp != cpudata->epp_cached)
++ WRITE_ONCE(cpudata->epp_cached, epp);
+
+ return 0;
+ }
+
+ DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
+
+-static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata,
++static inline int amd_pstate_update_perf(struct cpufreq_policy *policy,
+ u8 min_perf, u8 des_perf,
+ u8 max_perf, u8 epp,
+ bool fast_switch)
+ {
+- return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
++ return static_call(amd_pstate_update_perf)(policy, min_perf, des_perf,
+ max_perf, epp, fast_switch);
+ }
+
+-static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp)
++static int msr_set_epp(struct cpufreq_policy *policy, u8 epp)
+ {
++ struct amd_cpudata *cpudata = policy->driver_data;
+ u64 value, prev;
+ int ret;
+
+@@ -281,6 +296,19 @@ static int msr_set_epp(struct amd_cpudat
+ value &= ~AMD_CPPC_EPP_PERF_MASK;
+ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
+
++ if (trace_amd_pstate_epp_perf_enabled()) {
++ union perf_cached perf = cpudata->perf;
++
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
++ epp,
++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
++ cpudata->cppc_req_cached),
++ FIELD_GET(AMD_CPPC_MAX_PERF_MASK,
++ cpudata->cppc_req_cached),
++ policy->boost_enabled,
++ value != prev);
++ }
++
+ if (value == prev)
+ return 0;
+
+@@ -299,15 +327,29 @@ static int msr_set_epp(struct amd_cpudat
+
+ DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp);
+
+-static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp)
++static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp)
+ {
+- return static_call(amd_pstate_set_epp)(cpudata, epp);
++ return static_call(amd_pstate_set_epp)(policy, epp);
+ }
+
+-static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp)
++static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp)
+ {
+- int ret;
++ struct amd_cpudata *cpudata = policy->driver_data;
+ struct cppc_perf_ctrls perf_ctrls;
++ int ret;
++
++ if (trace_amd_pstate_epp_perf_enabled()) {
++ union perf_cached perf = cpudata->perf;
++
++ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
++ epp,
++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
++ cpudata->cppc_req_cached),
++ FIELD_GET(AMD_CPPC_MAX_PERF_MASK,
++ cpudata->cppc_req_cached),
++ policy->boost_enabled,
++ epp != cpudata->epp_cached);
++ }
+
+ if (epp == cpudata->epp_cached)
+ return 0;
+@@ -339,17 +381,7 @@ static int amd_pstate_set_energy_pref_in
+ return -EBUSY;
+ }
+
+- if (trace_amd_pstate_epp_perf_enabled()) {
+- union perf_cached perf = READ_ONCE(cpudata->perf);
+-
+- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+- epp,
+- FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
+- FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached),
+- policy->boost_enabled);
+- }
+-
+- return amd_pstate_set_epp(cpudata, epp);
++ return amd_pstate_set_epp(policy, epp);
+ }
+
+ static inline int msr_cppc_enable(bool enable)
+@@ -492,15 +524,16 @@ static inline int amd_pstate_init_perf(s
+ return static_call(amd_pstate_init_perf)(cpudata);
+ }
+
+-static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf,
++static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf,
+ u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
+ {
++ struct amd_cpudata *cpudata = policy->driver_data;
+ struct cppc_perf_ctrls perf_ctrls;
+ u64 value, prev;
+ int ret;
+
+ if (cppc_state == AMD_PSTATE_ACTIVE) {
+- int ret = shmem_set_epp(cpudata, epp);
++ int ret = shmem_set_epp(policy, epp);
+
+ if (ret)
+ return ret;
+@@ -515,6 +548,18 @@ static int shmem_update_perf(struct amd_
+ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
+ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
+
++ if (trace_amd_pstate_epp_perf_enabled()) {
++ union perf_cached perf = READ_ONCE(cpudata->perf);
++
++ trace_amd_pstate_epp_perf(cpudata->cpu,
++ perf.highest_perf,
++ epp,
++ min_perf,
++ max_perf,
++ policy->boost_enabled,
++ value != prev);
++ }
++
+ if (value == prev)
+ return 0;
+
+@@ -592,7 +637,7 @@ static void amd_pstate_update(struct amd
+ cpudata->cpu, fast_switch);
+ }
+
+- amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch);
++ amd_pstate_update_perf(policy, min_perf, des_perf, max_perf, 0, fast_switch);
+ }
+
+ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
+@@ -1525,7 +1570,7 @@ static int amd_pstate_epp_cpu_init(struc
+ return ret;
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
+ }
+- ret = amd_pstate_set_epp(cpudata, cpudata->epp_default);
++ ret = amd_pstate_set_epp(policy, cpudata->epp_default);
+ if (ret)
+ return ret;
+
+@@ -1566,14 +1611,8 @@ static int amd_pstate_epp_update_limit(s
+ epp = READ_ONCE(cpudata->epp_cached);
+
+ perf = READ_ONCE(cpudata->perf);
+- if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp,
+- perf.min_limit_perf,
+- perf.max_limit_perf,
+- policy->boost_enabled);
+- }
+
+- return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U,
++ return amd_pstate_update_perf(policy, perf.min_limit_perf, 0U,
+ perf.max_limit_perf, epp, false);
+ }
+
+@@ -1605,20 +1644,12 @@ static int amd_pstate_epp_set_policy(str
+
+ static int amd_pstate_epp_reenable(struct cpufreq_policy *policy)
+ {
+- struct amd_cpudata *cpudata = policy->driver_data;
+- union perf_cached perf = READ_ONCE(cpudata->perf);
+ int ret;
+
+ ret = amd_pstate_cppc_enable(true);
+ if (ret)
+ pr_err("failed to enable amd pstate during resume, return %d\n", ret);
+
+- if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+- cpudata->epp_cached,
+- FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
+- perf.highest_perf, policy->boost_enabled);
+- }
+
+ return amd_pstate_epp_update_limit(policy);
+ }
+@@ -1646,14 +1677,7 @@ static int amd_pstate_epp_cpu_offline(st
+ if (cpudata->suspended)
+ return 0;
+
+- if (trace_amd_pstate_epp_perf_enabled()) {
+- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
+- AMD_CPPC_EPP_BALANCE_POWERSAVE,
+- perf.lowest_perf, perf.lowest_perf,
+- policy->boost_enabled);
+- }
+-
+- return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf,
++ return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
+ }
+
READ_ONCE(cpudata->perf); ++ ++ trace_amd_pstate_epp_perf(cpudata->cpu, ++ perf.highest_perf, ++ epp, ++ min_perf, ++ max_perf, ++ policy->boost_enabled, ++ value != prev); ++ + if (value == prev) + return 0; + +@@ -592,7 +637,7 @@ static void amd_pstate_update(struct amd + cpudata->cpu, fast_switch); + } + +- amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); ++ amd_pstate_update_perf(policy, min_perf, des_perf, max_perf, 0, fast_switch); + } + + static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) +@@ -1525,7 +1570,7 @@ static int amd_pstate_epp_cpu_init(struc + return ret; + WRITE_ONCE(cpudata->cppc_req_cached, value); + } +- ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); ++ ret = amd_pstate_set_epp(policy, cpudata->epp_default); + if (ret) + return ret; + +@@ -1566,14 +1611,8 @@ static int amd_pstate_epp_update_limit(s + epp = READ_ONCE(cpudata->epp_cached); + + perf = READ_ONCE(cpudata->perf); +- if (trace_amd_pstate_epp_perf_enabled()) { +- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp, +- perf.min_limit_perf, +- perf.max_limit_perf, +- policy->boost_enabled); +- } + +- return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U, ++ return amd_pstate_update_perf(policy, perf.min_limit_perf, 0U, + perf.max_limit_perf, epp, false); + } + +@@ -1605,20 +1644,12 @@ static int amd_pstate_epp_set_policy(str + + static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) + { +- struct amd_cpudata *cpudata = policy->driver_data; +- union perf_cached perf = READ_ONCE(cpudata->perf); + int ret; + + ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + +- if (trace_amd_pstate_epp_perf_enabled()) { +- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, +- cpudata->epp_cached, +- FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), +- perf.highest_perf, policy->boost_enabled); +- } + + return amd_pstate_epp_update_limit(policy); + } +@@ -1646,14 +1677,7 @@ static int amd_pstate_epp_cpu_offline(st + if (cpudata->suspended) + return 0; + +- if (trace_amd_pstate_epp_perf_enabled()) { +- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, +- AMD_CPPC_EPP_BALANCE_POWERSAVE, +- perf.lowest_perf, perf.lowest_perf, +- policy->boost_enabled); +- } +- +- return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf, ++ return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf, + AMD_CPPC_EPP_BALANCE_POWERSAVE, false); + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Update-cppc_req_cached-for-shared.patch b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Update-cppc_req_cached-for-shared.patch new file mode 100644 index 0000000..efb3248 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Update-cppc_req_cached-for-shared.patch @@ -0,0 +1,37 @@ +From eaf7b28995ee0346be8ac59869645e975eb6a91c Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:30 -0600 +Subject: cpufreq/amd-pstate: Update cppc_req_cached for shared mem EPP writes + +On EPP-only writes, update the cached variable so that the min/max +performance controls don't need to be updated again. + +Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Gautham R. 
Shenoy <gautham.shenoy@amd.com> +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -336,6 +336,7 @@ static int shmem_set_epp(struct cpufreq_ + { + struct amd_cpudata *cpudata = policy->driver_data; + struct cppc_perf_ctrls perf_ctrls; ++ u64 value; + int ret; + + if (trace_amd_pstate_epp_perf_enabled()) { +@@ -362,6 +363,11 @@ static int shmem_set_epp(struct cpufreq_ + } + WRITE_ONCE(cpudata->epp_cached, epp); + ++ value = READ_ONCE(cpudata->cppc_req_cached); ++ value &= ~AMD_CPPC_EPP_PERF_MASK; ++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ + return ret; + } + diff --git a/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-debug-statements-for-policy-.patch b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-debug-statements-for-policy-.patch new file mode 100644 index 0000000..c10b4af --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-debug-statements-for-policy-.patch @@ -0,0 +1,38 @@ +From a2ec1d51a050afc3a6d3ce35412d082e916e7eef Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:31 -0600 +Subject: cpufreq/amd-pstate: Drop debug statements for policy setting + +There are trace events that exist now for all amd-pstate modes that +will output information right before programming to the hardware. + +This makes the existing debug statements nothing but unnecessary +overhead. Drop them. + +Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com> +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -667,7 +667,6 @@ static int amd_pstate_verify(struct cpuf + } + + cpufreq_verify_within_cpu_limits(policy_data); +- pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); + + return 0; + } +@@ -1630,9 +1629,6 @@ static int amd_pstate_epp_set_policy(str + if (!policy->cpuinfo.max_freq) + return -ENODEV; + +- pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", +- policy->cpuinfo.max_freq, policy->max); +- + cpudata->policy = policy->policy; + + ret = amd_pstate_epp_update_limit(policy); diff --git a/debian/patches/patchset-pf/amd-pstate/0027-cpufreq-amd-pstate-Rework-CPPC-enabling.patch b/debian/patches/patchset-pf/amd-pstate/0027-cpufreq-amd-pstate-Rework-CPPC-enabling.patch new file mode 100644 index 0000000..d8a4cfe --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0027-cpufreq-amd-pstate-Rework-CPPC-enabling.patch @@ -0,0 +1,327 @@ +From 3a840f6d42aba96e1974857c157cab2f9c220045 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:32 -0600 +Subject: cpufreq/amd-pstate: Rework CPPC enabling + +The CPPC enable register is configured as "write once". That is, +any future writes don't actually do anything. + +Because of this, all the cleanup paths that currently exist for +CPPC disable are ineffective. + +Rework CPPC enable to only enable after all the CAP registers have +been read to avoid enabling CPPC on CPUs with invalid _CPC or +unpopulated MSRs. + +As the register is write once, remove all cleanup paths as well. 
+ +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 179 +++++++---------------------------- + 1 file changed, 35 insertions(+), 144 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -85,7 +85,6 @@ static struct cpufreq_driver *current_ps + static struct cpufreq_driver amd_pstate_driver; + static struct cpufreq_driver amd_pstate_epp_driver; + static int cppc_state = AMD_PSTATE_UNDEFINED; +-static bool cppc_enabled; + static bool amd_pstate_prefcore = true; + static struct quirk_entry *quirks; + +@@ -371,89 +370,21 @@ static int shmem_set_epp(struct cpufreq_ + return ret; + } + +-static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, +- int pref_index) ++static inline int msr_cppc_enable(struct cpufreq_policy *policy) + { +- struct amd_cpudata *cpudata = policy->driver_data; +- u8 epp; +- +- if (!pref_index) +- epp = cpudata->epp_default; +- else +- epp = epp_values[pref_index]; +- +- if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { +- pr_debug("EPP cannot be set under performance policy\n"); +- return -EBUSY; +- } +- +- return amd_pstate_set_epp(policy, epp); +-} +- +-static inline int msr_cppc_enable(bool enable) +-{ +- int ret, cpu; +- unsigned long logical_proc_id_mask = 0; +- +- /* +- * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. +- */ +- if (!enable) +- return 0; +- +- if (enable == cppc_enabled) +- return 0; +- +- for_each_present_cpu(cpu) { +- unsigned long logical_id = topology_logical_package_id(cpu); +- +- if (test_bit(logical_id, &logical_proc_id_mask)) +- continue; +- +- set_bit(logical_id, &logical_proc_id_mask); +- +- ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE, +- enable); +- if (ret) +- return ret; +- } +- +- cppc_enabled = enable; +- return 0; ++ return wrmsrl_safe_on_cpu(policy->cpu, MSR_AMD_CPPC_ENABLE, 1); + } + +-static int shmem_cppc_enable(bool enable) ++static int shmem_cppc_enable(struct cpufreq_policy *policy) + { +- int cpu, ret = 0; +- struct cppc_perf_ctrls perf_ctrls; +- +- if (enable == cppc_enabled) +- return 0; +- +- for_each_present_cpu(cpu) { +- ret = cppc_set_enable(cpu, enable); +- if (ret) +- return ret; +- +- /* Enable autonomous mode for EPP */ +- if (cppc_state == AMD_PSTATE_ACTIVE) { +- /* Set desired perf as zero to allow EPP firmware control */ +- perf_ctrls.desired_perf = 0; +- ret = cppc_set_perf(cpu, &perf_ctrls); +- if (ret) +- return ret; +- } +- } +- +- cppc_enabled = enable; +- return ret; ++ return cppc_set_enable(policy->cpu, 1); + } + + DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); + +-static inline int amd_pstate_cppc_enable(bool enable) ++static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy) + { +- return static_call(amd_pstate_cppc_enable)(enable); ++ return static_call(amd_pstate_cppc_enable)(policy); + } + + static int msr_init_perf(struct amd_cpudata *cpudata) +@@ -1063,6 +994,10 @@ static int amd_pstate_cpu_init(struct cp + cpudata->nominal_freq, + perf.highest_perf); + ++ ret = amd_pstate_cppc_enable(policy); ++ if (ret) ++ goto free_cpudata1; ++ + policy->boost_enabled = READ_ONCE(cpudata->boost_supported); + + /* It will be updated by governor */ +@@ -1110,28 +1045,6 @@ static void amd_pstate_cpu_exit(struct c + kfree(cpudata); + } + +-static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) +-{ +- int ret; +- +- ret = amd_pstate_cppc_enable(true); +- if (ret) +- pr_err("failed to enable amd-pstate during resume, return %d\n", ret); +- 
+- return ret; +-} +- +-static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) +-{ +- int ret; +- +- ret = amd_pstate_cppc_enable(false); +- if (ret) +- pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); +- +- return ret; +-} +- + /* Sysfs attributes */ + + /* +@@ -1223,8 +1136,10 @@ static ssize_t show_energy_performance_a + static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) + { ++ struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; ++ u8 epp; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) +@@ -1234,7 +1149,17 @@ static ssize_t store_energy_performance_ + if (ret < 0) + return -EINVAL; + +- ret = amd_pstate_set_energy_pref_index(policy, ret); ++ if (!ret) ++ epp = cpudata->epp_default; ++ else ++ epp = epp_values[ret]; ++ ++ if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { ++ pr_debug("EPP cannot be set under performance policy\n"); ++ return -EBUSY; ++ } ++ ++ ret = amd_pstate_set_epp(policy, epp); + + return ret ? ret : count; + } +@@ -1267,7 +1192,6 @@ static ssize_t show_energy_performance_p + + static void amd_pstate_driver_cleanup(void) + { +- amd_pstate_cppc_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; + } +@@ -1301,14 +1225,6 @@ static int amd_pstate_register_driver(in + + cppc_state = mode; + +- ret = amd_pstate_cppc_enable(true); +- if (ret) { +- pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", +- ret); +- amd_pstate_driver_cleanup(); +- return ret; +- } +- + /* at least one CPU supports CPB */ + current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); + +@@ -1548,11 +1464,15 @@ static int amd_pstate_epp_cpu_init(struc + policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, + cpudata->nominal_freq, + perf.highest_perf); ++ policy->driver_data = cpudata; ++ ++ ret = amd_pstate_cppc_enable(policy); ++ if (ret) ++ goto free_cpudata1; + + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + +- policy->driver_data = cpudata; + + policy->boost_enabled = READ_ONCE(cpudata->boost_supported); + +@@ -1644,31 +1564,11 @@ static int amd_pstate_epp_set_policy(str + return 0; + } + +-static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) +-{ +- int ret; +- +- ret = amd_pstate_cppc_enable(true); +- if (ret) +- pr_err("failed to enable amd pstate during resume, return %d\n", ret); +- +- +- return amd_pstate_epp_update_limit(policy); +-} +- + static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + { +- struct amd_cpudata *cpudata = policy->driver_data; +- int ret; +- +- pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); ++ pr_debug("AMD CPU Core %d going online\n", policy->cpu); + +- ret = amd_pstate_epp_reenable(policy); +- if (ret) +- return ret; +- cpudata->suspended = false; +- +- return 0; ++ return amd_pstate_cppc_enable(policy); + } + + static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) +@@ -1686,11 +1586,6 @@ static int amd_pstate_epp_cpu_offline(st + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- int ret; +- +- /* avoid suspending when EPP is not enabled */ +- if (cppc_state != AMD_PSTATE_ACTIVE) +- return 0; + + /* invalidate to ensure it's rewritten during resume */ + cpudata->cppc_req_cached = 0; +@@ -1698,11 +1593,6 @@ static int amd_pstate_epp_suspend(struct + /* set 
this flag to avoid setting core offline*/ + cpudata->suspended = true; + +- /* disable CPPC in lowlevel firmware */ +- ret = amd_pstate_cppc_enable(false); +- if (ret) +- pr_err("failed to suspend, return %d\n", ret); +- + return 0; + } + +@@ -1711,8 +1601,12 @@ static int amd_pstate_epp_resume(struct + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->suspended) { ++ int ret; ++ + /* enable amd pstate from suspend state*/ +- amd_pstate_epp_reenable(policy); ++ ret = amd_pstate_epp_update_limit(policy); ++ if (ret) ++ return ret; + + cpudata->suspended = false; + } +@@ -1727,8 +1621,6 @@ static struct cpufreq_driver amd_pstate_ + .fast_switch = amd_pstate_fast_switch, + .init = amd_pstate_cpu_init, + .exit = amd_pstate_cpu_exit, +- .suspend = amd_pstate_cpu_suspend, +- .resume = amd_pstate_cpu_resume, + .set_boost = amd_pstate_set_boost, + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", +@@ -1895,7 +1787,6 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +- amd_pstate_cppc_enable(false); + return ret; + } + device_initcall(amd_pstate_init); diff --git a/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Stop-caching-EPP.patch b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Stop-caching-EPP.patch new file mode 100644 index 0000000..075bb79 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Stop-caching-EPP.patch @@ -0,0 +1,105 @@ +From 5fda2a5a547244c99bce9327e77e2ff253f77add Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:33 -0600 +Subject: cpufreq/amd-pstate: Stop caching EPP + +EPP values are cached in the cpudata structure per CPU. This is needless +though because they are also cached in the CPPC request variable. + +Drop the separate cache for EPP values and always reference the CPPC +request variable when needed. + +Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Reviewed-by: Gautham R. 
Shenoy <gautham.shenoy@amd.com> +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 19 ++++++++++--------- + drivers/cpufreq/amd-pstate.h | 1 - + 2 files changed, 10 insertions(+), 10 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -268,8 +268,6 @@ static int msr_update_perf(struct cpufre + } + + WRITE_ONCE(cpudata->cppc_req_cached, value); +- if (epp != cpudata->epp_cached) +- WRITE_ONCE(cpudata->epp_cached, epp); + + return 0; + } +@@ -318,7 +316,6 @@ static int msr_set_epp(struct cpufreq_po + } + + /* update both so that msr_update_perf() can effectively check */ +- WRITE_ONCE(cpudata->epp_cached, epp); + WRITE_ONCE(cpudata->cppc_req_cached, value); + + return ret; +@@ -335,9 +332,12 @@ static int shmem_set_epp(struct cpufreq_ + { + struct amd_cpudata *cpudata = policy->driver_data; + struct cppc_perf_ctrls perf_ctrls; ++ u8 epp_cached; + u64 value; + int ret; + ++ ++ epp_cached = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = cpudata->perf; + +@@ -348,10 +348,10 @@ static int shmem_set_epp(struct cpufreq_ + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, + cpudata->cppc_req_cached), + policy->boost_enabled, +- epp != cpudata->epp_cached); ++ epp != epp_cached); + } + +- if (epp == cpudata->epp_cached) ++ if (epp == epp_cached) + return 0; + + perf_ctrls.energy_perf = epp; +@@ -360,7 +360,6 @@ static int shmem_set_epp(struct cpufreq_ + pr_debug("failed to set energy perf value (%d)\n", ret); + return ret; + } +- WRITE_ONCE(cpudata->epp_cached, epp); + + value = READ_ONCE(cpudata->cppc_req_cached); + value &= ~AMD_CPPC_EPP_PERF_MASK; +@@ -1168,9 +1167,11 @@ static ssize_t show_energy_performance_p + struct cpufreq_policy *policy, char *buf) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u8 preference; ++ u8 preference, epp; ++ ++ epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + +- switch (cpudata->epp_cached) { ++ switch (epp) { + case AMD_CPPC_EPP_PERFORMANCE: + preference = EPP_INDEX_PERFORMANCE; + break; +@@ -1533,7 +1534,7 @@ static int amd_pstate_epp_update_limit(s + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + else +- epp = READ_ONCE(cpudata->epp_cached); ++ epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + + perf = READ_ONCE(cpudata->perf); + +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -102,7 +102,6 @@ struct amd_cpudata { + bool hw_prefcore; + + /* EPP feature related attributes*/ +- u8 epp_cached; + u32 policy; + bool suspended; + u8 epp_default; diff --git a/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Drop-actions-in-amd_pstate_epp_cp.patch b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Drop-actions-in-amd_pstate_epp_cp.patch new file mode 100644 index 0000000..6fe1cd5 --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Drop-actions-in-amd_pstate_epp_cp.patch @@ -0,0 +1,39 @@ +From 7757237a6ee08403e9a0e58eebf53ae2203f65ae Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <mario.limonciello@amd.com> +Date: Wed, 26 Feb 2025 01:49:34 -0600 +Subject: cpufreq/amd-pstate: Drop actions in amd_pstate_epp_cpu_offline() + +When the CPU goes offline there is no need to change the CPPC request +because the CPU will go into the deepest C-state it supports already. 
+ +Actually changing the CPPC request when it goes offline messes up the +cached values and can lead to the wrong values being restored when +it comes back. + +Instead, drop the actions and, if the CPU comes back online, let +amd_pstate_epp_set_policy() restore it to the expected values. + +Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com> +Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> +--- + drivers/cpufreq/amd-pstate.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -1574,14 +1574,7 @@ static int amd_pstate_epp_cpu_online(str + + static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + { +- struct amd_cpudata *cpudata = policy->driver_data; +- union perf_cached perf = READ_ONCE(cpudata->perf); +- +- if (cpudata->suspended) +- return 0; +- +- return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf, +- AMD_CPPC_EPP_BALANCE_POWERSAVE, false); ++ return 0; + } + + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) diff --git a/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-fix-warning-noticed-by-kernel-tes.patch b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-fix-warning-noticed-by-kernel-tes.patch new file mode 100644 index 0000000..e9adf6d --- /dev/null +++ b/debian/patches/patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-fix-warning-noticed-by-kernel-tes.patch @@ -0,0 +1,41 @@ +From f25d506d1e54b7d0a5fe42284cd5f2ca5c21cef7 Mon Sep 17 00:00:00 2001 +From: Mario Limonciello <superm1@kernel.org> +Date: Thu, 27 Feb 2025 14:09:08 -0600 +Subject: cpufreq/amd-pstate: fix warning noticed by kernel test robot + +Reported-by: kernel test robot <lkp@intel.com> +Closes: https://lore.kernel.org/oe-kbuild-all/202502272001.nafS0qXq-lkp@intel.com/ +Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +--- + drivers/cpufreq/amd-pstate.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -903,20 +903,19 @@ static int amd_pstate_init_freq(struct a + return ret; + perf = READ_ONCE(cpudata->perf); + ++ if (quirks && quirks->nominal_freq) ++ nominal_freq = quirks->nominal_freq; ++ else ++ nominal_freq = cppc_perf.nominal_freq; ++ nominal_freq *= 1000; ++ + if (quirks && quirks->lowest_freq) { + min_freq = quirks->lowest_freq; + perf.lowest_perf = freq_to_perf(perf, nominal_freq, min_freq); + WRITE_ONCE(cpudata->perf, perf); + } else + min_freq = cppc_perf.lowest_freq; +- +- if (quirks && quirks->nominal_freq) +- nominal_freq = quirks->nominal_freq; +- else +- nominal_freq = cppc_perf.nominal_freq; +- + min_freq *= 1000; +- nominal_freq *= 1000; + + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + diff --git a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-Prefer-teo-over-menu-governor.patch similarity index 94% rename from debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch rename to debian/patches/patchset-pf/cpuidle/0001-cpuidle-Prefer-teo-over-menu-governor.patch index fadbf0b..ca9b0c7 100644 --- a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch +++ b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-Prefer-teo-over-menu-governor.patch @@ -1,4 +1,4 @@ -From 4c13cc86a7d9f1e88e8090a94c792eb45d7ef1ef Mon Sep 17 00:00:00 2001 +From 7a0fbf076914b2b0e55feddd839212af92bdffb3 
Mon Sep 17 00:00:00 2001 From: Christian Loehle <christian.loehle@arm.com> Date: Thu, 5 Sep 2024 10:26:39 +0100 Subject: cpuidle: Prefer teo over menu governor @@ -47,7 +47,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com> .reflect = menu_reflect, --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c -@@ -542,7 +542,7 @@ static int teo_enable_device(struct cpui +@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui static struct cpuidle_governor teo_governor = { .name = "teo", diff --git a/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch b/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch deleted file mode 100644 index 85c5208..0000000 --- a/debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch +++ /dev/null @@ -1,189 +0,0 @@ -From 3d722d5259babc1650ab6cb1a8bbf27863af75f2 Mon Sep 17 00:00:00 2001 -From: Christian Loehle <christian.loehle@arm.com> -Date: Thu, 5 Sep 2024 10:26:38 +0100 -Subject: cpuidle: menu: Remove iowait influence - -Remove CPU iowaiters influence on idle state selection. - -Remove the menu notion of performance multiplier which increased with -the number of tasks that went to iowait sleep on this CPU and haven't -woken up yet. - -Relying on iowait for cpuidle is problematic for a few reasons: - - 1. There is no guarantee that an iowaiting task will wake up on the - same CPU. - - 2. The task being in iowait says nothing about the idle duration, we - could be selecting shallower states for a long time. - - 3. The task being in iowait doesn't always imply a performance hit - with increased latency. - - 4. If there is such a performance hit, the number of iowaiting tasks - doesn't directly correlate. - - 5. The definition of iowait altogether is vague at best, it is - sprinkled across kernel code. - -Signed-off-by: Christian Loehle <christian.loehle@arm.com> -Link: https://patch.msgid.link/20240905092645.2885200-2-christian.loehle@arm.com -[ rjw: Minor edits in the changelog ] -Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> ---- - drivers/cpuidle/governors/menu.c | 76 ++++---------------------------- - 1 file changed, 9 insertions(+), 67 deletions(-) - ---- a/drivers/cpuidle/governors/menu.c -+++ b/drivers/cpuidle/governors/menu.c -@@ -19,7 +19,7 @@ - - #include "gov.h" - --#define BUCKETS 12 -+#define BUCKETS 6 - #define INTERVAL_SHIFT 3 - #define INTERVALS (1UL << INTERVAL_SHIFT) - #define RESOLUTION 1024 -@@ -29,12 +29,11 @@ - /* - * Concepts and ideas behind the menu governor - * -- * For the menu governor, there are 3 decision factors for picking a C -+ * For the menu governor, there are 2 decision factors for picking a C - * state: - * 1) Energy break even point -- * 2) Performance impact -- * 3) Latency tolerance (from pmqos infrastructure) -- * These three factors are treated independently. -+ * 2) Latency tolerance (from pmqos infrastructure) -+ * These two factors are treated independently. - * - * Energy break even point - * ----------------------- -@@ -75,30 +74,6 @@ - * intervals and if the stand deviation of these 8 intervals is below a - * threshold value, we use the average of these intervals as prediction. - * -- * Limiting Performance Impact -- * --------------------------- -- * C states, especially those with large exit latencies, can have a real -- * noticeable impact on workloads, which is not acceptable for most sysadmins, -- * and in addition, less performance has a power price of its own. 
-- * -- * As a general rule of thumb, menu assumes that the following heuristic -- * holds: -- * The busier the system, the less impact of C states is acceptable -- * -- * This rule-of-thumb is implemented using a performance-multiplier: -- * If the exit latency times the performance multiplier is longer than -- * the predicted duration, the C state is not considered a candidate -- * for selection due to a too high performance impact. So the higher -- * this multiplier is, the longer we need to be idle to pick a deep C -- * state, and thus the less likely a busy CPU will hit such a deep -- * C state. -- * -- * Currently there is only one value determining the factor: -- * 10 points are added for each process that is waiting for IO on this CPU. -- * (This value was experimentally determined.) -- * Utilization is no longer a factor as it was shown that it never contributed -- * significantly to the performance multiplier in the first place. -- * - */ - - struct menu_device { -@@ -112,19 +87,10 @@ struct menu_device { - int interval_ptr; - }; - --static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) -+static inline int which_bucket(u64 duration_ns) - { - int bucket = 0; - -- /* -- * We keep two groups of stats; one with no -- * IO pending, one without. -- * This allows us to calculate -- * E(duration)|iowait -- */ -- if (nr_iowaiters) -- bucket = BUCKETS/2; -- - if (duration_ns < 10ULL * NSEC_PER_USEC) - return bucket; - if (duration_ns < 100ULL * NSEC_PER_USEC) -@@ -138,19 +104,6 @@ static inline int which_bucket(u64 durat - return bucket + 5; - } - --/* -- * Return a multiplier for the exit latency that is intended -- * to take performance requirements into account. -- * The more performance critical we estimate the system -- * to be, the higher this multiplier, and thus the higher -- * the barrier to go to an expensive C state. -- */ --static inline int performance_multiplier(unsigned int nr_iowaiters) --{ -- /* for IO wait tasks (per cpu!) we add 10x each */ -- return 1 + 10 * nr_iowaiters; --} -- - static DEFINE_PER_CPU(struct menu_device, menu_devices); - - static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); -@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_dr - struct menu_device *data = this_cpu_ptr(&menu_devices); - s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - u64 predicted_ns; -- u64 interactivity_req; -- unsigned int nr_iowaiters; - ktime_t delta, delta_tick; - int i, idx; - -@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_dr - data->needs_update = 0; - } - -- nr_iowaiters = nr_iowait_cpu(dev->cpu); -- - /* Find the shortest expected idle interval. */ - predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; - if (predicted_ns > RESIDENCY_THRESHOLD_NS) { -@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_dr - } - - data->next_timer_ns = delta; -- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); -+ data->bucket = which_bucket(data->next_timer_ns); - - /* Round up the result for half microseconds. 
*/ - timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + -@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_dr - */ - data->next_timer_ns = KTIME_MAX; - delta_tick = TICK_NSEC / 2; -- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); -+ data->bucket = which_bucket(KTIME_MAX); - } - - if (unlikely(drv->state_count <= 1 || latency_req == 0) || -@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_dr - */ - if (predicted_ns < TICK_NSEC) - predicted_ns = data->next_timer_ns; -- } else { -- /* -- * Use the performance multiplier and the user-configurable -- * latency_req to determine the maximum exit latency. -- */ -- interactivity_req = div64_u64(predicted_ns, -- performance_multiplier(nr_iowaiters)); -- if (latency_req > interactivity_req) -- latency_req = interactivity_req; -+ } else if (latency_req > predicted_ns) { -+ latency_req = predicted_ns; - } - - /* diff --git a/debian/patches/patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch b/debian/patches/patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch new file mode 100644 index 0000000..6cd4781 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch @@ -0,0 +1,65 @@ +From 594316efc465f1408482e0d1dd379f4e3a6a5c7c Mon Sep 17 00:00:00 2001 +From: Eric Biggers <ebiggers@google.com> +Date: Mon, 27 Jan 2025 13:16:09 -0800 +Subject: crypto: x86/aes-xts - make the fast path 64-bit specific + +Remove 32-bit support from the fast path in xts_crypt(). Then optimize +it for 64-bit, and simplify the code, by switching to sg_virt() and +removing the now-unnecessary checks for crossing a page boundary. + +The result is simpler code that is slightly smaller and faster in the +case that actually matters (64-bit). + +Signed-off-by: Eric Biggers <ebiggers@google.com> +--- + arch/x86/crypto/aesni-intel_glue.c | 30 ++++++++++-------------------- + 1 file changed, 10 insertions(+), 20 deletions(-) + +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -581,11 +581,8 @@ xts_crypt(struct skcipher_request *req, + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); +- const unsigned int cryptlen = req->cryptlen; +- struct scatterlist *src = req->src; +- struct scatterlist *dst = req->dst; + +- if (unlikely(cryptlen < AES_BLOCK_SIZE)) ++ if (unlikely(req->cryptlen < AES_BLOCK_SIZE)) + return -EINVAL; + + kernel_fpu_begin(); +@@ -593,23 +590,16 @@ xts_crypt(struct skcipher_request *req, + + /* + * In practice, virtually all XTS plaintexts and ciphertexts are either +- * 512 or 4096 bytes, aligned such that they don't span page boundaries. +- * To optimize the performance of these cases, and also any other case +- * where no page boundary is spanned, the below fast-path handles +- * single-page sources and destinations as efficiently as possible. ++ * 512 or 4096 bytes and do not use multiple scatterlist elements. To ++ * optimize the performance of these cases, the below fast-path handles ++ * single-scatterlist-element messages as efficiently as possible. The ++ * code is 64-bit specific, as it assumes no page mapping is needed. 
+ */ +- if (likely(src->length >= cryptlen && dst->length >= cryptlen && +- src->offset + cryptlen <= PAGE_SIZE && +- dst->offset + cryptlen <= PAGE_SIZE)) { +- struct page *src_page = sg_page(src); +- struct page *dst_page = sg_page(dst); +- void *src_virt = kmap_local_page(src_page) + src->offset; +- void *dst_virt = kmap_local_page(dst_page) + dst->offset; +- +- (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen, +- req->iv); +- kunmap_local(dst_virt); +- kunmap_local(src_virt); ++ if (IS_ENABLED(CONFIG_X86_64) && ++ likely(req->src->length >= req->cryptlen && ++ req->dst->length >= req->cryptlen)) { ++ (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src), ++ sg_virt(req->dst), req->cryptlen, req->iv); + kernel_fpu_end(); + return 0; + } diff --git a/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch b/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch deleted file mode 100644 index d20cf05..0000000 --- a/debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch +++ /dev/null @@ -1,181 +0,0 @@ -From 0a957679a29a06fb2e3971615ff9f05f6becb941 Mon Sep 17 00:00:00 2001 -From: Eric Biggers <ebiggers@google.com> -Date: Sun, 13 Oct 2024 21:06:49 -0700 -Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes - -The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for -lengths >= 512, due to the overhead of saving and restoring FPU state. -Therefore, it is unnecessary for this code to be excessively "optimized" -for lengths < 200. Eliminate the excessive unrolling of this part of -the code and use a more straightforward qword-at-a-time loop. - -Note: the part of the code in question is not entirely redundant, as it -is still used to process any remainder mod 24, as well as any remaining -data when fewer than 200 bytes remain after least one 3072-byte chunk. - -Signed-off-by: Eric Biggers <ebiggers@google.com> ---- - arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++---------------- - 1 file changed, 33 insertions(+), 83 deletions(-) - ---- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -@@ -56,20 +56,10 @@ - .quad .Lcrc_\i - .endm - --.macro JNC_LESS_THAN j -- jnc .Lless_than_\j --.endm -- --# Define threshold where buffers are considered "small" and routed to more --# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so --# SMALL_SIZE can be no larger than 255. -- -+# Define threshold below which buffers are considered "small" and routed to -+# regular CRC code that does not interleave the CRC instructions. 
- #define SMALL_SIZE 200 - --.if (SMALL_SIZE > 255) --.error "SMALL_ SIZE must be < 256" --.endif -- - # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); - - .text -@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl) - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init - -+ mov %bufp, bufptmp # rdi = *buf -+ cmp $SMALL_SIZE, len -+ jb .Lsmall -+ - ################################################################ - ## 1) ALIGN: - ################################################################ -- -- mov %bufp, bufptmp # rdi = *buf - neg %bufp - and $7, %bufp # calculate the unalignment amount of - # the address - je .Lproc_block # Skip if aligned - -- ## If len is less than 8 and we're unaligned, we need to jump -- ## to special code to avoid reading beyond the end of the buffer -- cmp $8, len -- jae .Ldo_align -- # less_than_8 expects length in upper 3 bits of len_dw -- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] -- shl $32-3+1, len_dw -- jmp .Lless_than_8_post_shl1 -- - .Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer -@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl) - jae .Lfull_block - - .Lcontinue_block: -- cmpq $SMALL_SIZE, len -- jb .Lsmall -- - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw -@@ -243,68 +223,38 @@ LABEL crc_ 0 - mov tmp, len - cmp $128*24, tmp - jae .Lfull_block -- cmp $24, tmp -+ cmp $SMALL_SIZE, tmp - jae .Lcontinue_block - --.Lless_than_24: -- shl $32-4, len_dw # less_than_16 expects length -- # in upper 4 bits of len_dw -- jnc .Lless_than_16 -- crc32q (bufptmp), crc_init -- crc32q 8(bufptmp), crc_init -- jz .Ldo_return -- add $16, bufptmp -- # len is less than 8 if we got here -- # less_than_8 expects length in upper 3 bits of len_dw -- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] -- shl $2, len_dw -- jmp .Lless_than_8_post_shl1 -- - ####################################################################### -- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) -+ ## 6) Process any remainder without interleaving: - ####################################################################### - .Lsmall: -- shl $32-8, len_dw # Prepare len_dw for less_than_256 -- j=256 --.rept 5 # j = {256, 128, 64, 32, 16} --.altmacro --LABEL less_than_ %j # less_than_j: Length should be in -- # upper lg(j) bits of len_dw -- j=(j/2) -- shl $1, len_dw # Get next MSB -- JNC_LESS_THAN %j --.noaltmacro -- i=0 --.rept (j/8) -- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data -- i=i+8 --.endr -- jz .Ldo_return # Return if remaining length is zero -- add $j, bufptmp # Advance buf --.endr -- --.Lless_than_8: # Length should be stored in -- # upper 3 bits of len_dw -- shl $1, len_dw --.Lless_than_8_post_shl1: -- jnc .Lless_than_4 -- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes -- jz .Ldo_return # return if remaining data is zero -- add $4, bufptmp --.Lless_than_4: # Length should be stored in -- # upper 2 bits of len_dw -- shl $1, len_dw -- jnc .Lless_than_2 -- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes -- jz .Ldo_return # return if remaining data is zero -- add $2, bufptmp --.Lless_than_2: # Length should be stored in the MSB -- # of len_dw -- shl $1, len_dw -- jnc .Lless_than_1 -- crc32b (bufptmp), crc_init_dw # CRC of 1 byte --.Lless_than_1: # Length should be zero --.Ldo_return: -+ test len, len -+ jz .Ldone -+ mov len_dw, %eax -+ shr $3, %eax -+ jz .Ldo_dword -+.Ldo_qwords: -+ 
crc32q (bufptmp), crc_init -+ add $8, bufptmp -+ dec %eax -+ jnz .Ldo_qwords -+.Ldo_dword: -+ test $4, len_dw -+ jz .Ldo_word -+ crc32l (bufptmp), crc_init_dw -+ add $4, bufptmp -+.Ldo_word: -+ test $2, len_dw -+ jz .Ldo_byte -+ crc32w (bufptmp), crc_init_dw -+ add $2, bufptmp -+.Ldo_byte: -+ test $1, len_dw -+ jz .Ldone -+ crc32b (bufptmp), crc_init_dw -+.Ldone: - movq crc_init, %rax - popq %rsi - popq %rdi diff --git a/debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch b/debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch new file mode 100644 index 0000000..2e2a475 --- /dev/null +++ b/debian/patches/patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch @@ -0,0 +1,1857 @@ +From b988178e5a6498eea32891a711f065cfbe4cedf4 Mon Sep 17 00:00:00 2001 +From: Eric Biggers <ebiggers@google.com> +Date: Mon, 10 Feb 2025 08:50:20 -0800 +Subject: crypto: x86/aes-ctr - rewrite AESNI+AVX optimized CTR and add VAES + support + +Delete aes_ctrby8_avx-x86_64.S and add a new assembly file +aes-ctr-avx-x86_64.S which follows a similar approach to +aes-xts-avx-x86_64.S in that it uses a "template" to provide AESNI+AVX, +VAES+AVX2, VAES+AVX10/256, and VAES+AVX10/512 code, instead of just +AESNI+AVX. Wire it up to the crypto API accordingly. + +This greatly improves the performance of AES-CTR and AES-XCTR on +VAES-capable CPUs, with the best case being AMD Zen 5 where an over 230% +increase in throughput is seen on long messages. Performance on +non-VAES-capable CPUs remains about the same, and the non-AVX AES-CTR +code (aesni_ctr_enc) is also kept as-is for now. There are some slight +regressions (less than 10%) on some short message lengths on some CPUs; +these are difficult to avoid, given how the previous code was so heavily +unrolled by message length, and they are not particularly important. +Detailed performance results are given in the tables below. + +Both CTR and XCTR support is retained. The main loop remains +8-vector-wide, which differs from the 4-vector-wide main loops that are +used in the XTS and GCM code. A wider loop is appropriate for CTR and +XCTR since they have fewer other instructions (such as vpclmulqdq) to +interleave with the AES instructions. + +Similar to what was the case for AES-GCM, the new assembly code also has +a much smaller binary size, as it fixes the excessive unrolling by data +length and key length present in the old code. Specifically, the new +assembly file compiles to about 9 KB of text vs. 28 KB for the old file. +This is despite 4x as many implementations being included. + +The tables below show the detailed performance results. The tables show +percentage improvement in single-threaded throughput for repeated +encryption of the given message length; an increase from 6000 MB/s to +12000 MB/s would be listed as 100%. They were collected by directly +measuring the Linux crypto API performance using a custom kernel module. +The tested CPUs were all server processors from Google Compute Engine +except for Zen 5 which was a Ryzen 9 9950X desktop processor. + +Table 1: AES-256-CTR throughput improvement, + CPU microarchitecture vs. 
message length in bytes: + + | 16384 | 4096 | 4095 | 1420 | 512 | 500 | +---------------------+-------+-------+-------+-------+-------+-------+ +AMD Zen 5 | 232% | 203% | 212% | 143% | 71% | 95% | +Intel Emerald Rapids | 116% | 116% | 117% | 91% | 78% | 79% | +Intel Ice Lake | 109% | 103% | 107% | 81% | 54% | 56% | +AMD Zen 4 | 109% | 91% | 100% | 70% | 43% | 59% | +AMD Zen 3 | 92% | 78% | 87% | 57% | 32% | 43% | +AMD Zen 2 | 9% | 8% | 14% | 12% | 8% | 21% | +Intel Skylake | 7% | 7% | 8% | 5% | 3% | 8% | + + | 300 | 200 | 64 | 63 | 16 | +---------------------+-------+-------+-------+-------+-------+ +AMD Zen 5 | 57% | 39% | -9% | 7% | -7% | +Intel Emerald Rapids | 37% | 42% | -0% | 13% | -8% | +Intel Ice Lake | 39% | 30% | -1% | 14% | -9% | +AMD Zen 4 | 42% | 38% | -0% | 18% | -3% | +AMD Zen 3 | 38% | 35% | 6% | 31% | 5% | +AMD Zen 2 | 24% | 23% | 5% | 30% | 3% | +Intel Skylake | 9% | 1% | -4% | 10% | -7% | + +Table 2: AES-256-XCTR throughput improvement, + CPU microarchitecture vs. message length in bytes: + + | 16384 | 4096 | 4095 | 1420 | 512 | 500 | +---------------------+-------+-------+-------+-------+-------+-------+ +AMD Zen 5 | 240% | 201% | 216% | 151% | 75% | 108% | +Intel Emerald Rapids | 100% | 99% | 102% | 91% | 94% | 104% | +Intel Ice Lake | 93% | 89% | 92% | 74% | 50% | 64% | +AMD Zen 4 | 86% | 75% | 83% | 60% | 41% | 52% | +AMD Zen 3 | 73% | 63% | 69% | 45% | 21% | 33% | +AMD Zen 2 | -2% | -2% | 2% | 3% | -1% | 11% | +Intel Skylake | -1% | -1% | 1% | 2% | -1% | 9% | + + | 300 | 200 | 64 | 63 | 16 | +---------------------+-------+-------+-------+-------+-------+ +AMD Zen 5 | 78% | 56% | -4% | 38% | -2% | +Intel Emerald Rapids | 61% | 55% | 4% | 32% | -5% | +Intel Ice Lake | 57% | 42% | 3% | 44% | -4% | +AMD Zen 4 | 35% | 28% | -1% | 17% | -3% | +AMD Zen 3 | 26% | 23% | -3% | 11% | -6% | +AMD Zen 2 | 13% | 24% | -1% | 14% | -3% | +Intel Skylake | 16% | 8% | -4% | 35% | -3% | + +Signed-off-by: Eric Biggers <ebiggers@google.com> +--- + arch/x86/crypto/Makefile | 2 +- + arch/x86/crypto/aes-ctr-avx-x86_64.S | 592 +++++++++++++++++++++++ + arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 597 ------------------------ + arch/x86/crypto/aesni-intel_glue.c | 404 ++++++++-------- + 4 files changed, 803 insertions(+), 792 deletions(-) + create mode 100644 arch/x86/crypto/aes-ctr-avx-x86_64.S + delete mode 100644 arch/x86/crypto/aes_ctrby8_avx-x86_64.S + +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -48,7 +48,7 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += cha + + obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +-aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ ++aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ + aes-xts-avx-x86_64.o + ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) +--- /dev/null ++++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S +@@ -0,0 +1,592 @@ ++/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ ++// ++// Copyright 2025 Google LLC ++// ++// Author: Eric Biggers <ebiggers@google.com> ++// ++// This file is dual-licensed, meaning that you can use it under your choice of ++// either of the following two licenses: ++// ++// Licensed under the Apache License 2.0 (the "License"). 
You may obtain a copy ++// of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++// ++// or ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// 1. Redistributions of source code must retain the above copyright notice, ++// this list of conditions and the following disclaimer. ++// ++// 2. Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++//------------------------------------------------------------------------------ ++// ++// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR ++// using the following sets of CPU features: ++// - AES-NI && AVX ++// - VAES && AVX2 ++// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2 ++// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2 ++// ++// See the function definitions at the bottom of the file for more information. ++ ++#include <linux/linkage.h> ++#include <linux/cfi_types.h> ++ ++.section .rodata ++.p2align 4 ++ ++.Lbswap_mask: ++ .octa 0x000102030405060708090a0b0c0d0e0f ++ ++.Lctr_pattern: ++ .quad 0, 0 ++.Lone: ++ .quad 1, 0 ++.Ltwo: ++ .quad 2, 0 ++ .quad 3, 0 ++ ++.Lfour: ++ .quad 4, 0 ++ ++.text ++ ++// Move a vector between memory and a register. ++// The register operand must be in the first 16 vector registers. ++.macro _vmovdqu src, dst ++.if VL < 64 ++ vmovdqu \src, \dst ++.else ++ vmovdqu8 \src, \dst ++.endif ++.endm ++ ++// Move a vector between registers. ++// The registers must be in the first 16 vector registers. ++.macro _vmovdqa src, dst ++.if VL < 64 ++ vmovdqa \src, \dst ++.else ++ vmovdqa64 \src, \dst ++.endif ++.endm ++ ++// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector ++// register. The register operand must be in the first 16 vector registers. ++.macro _vbroadcast128 src, dst ++.if VL == 16 ++ vmovdqu \src, \dst ++.elseif VL == 32 ++ vbroadcasti128 \src, \dst ++.else ++ vbroadcasti32x4 \src, \dst ++.endif ++.endm ++ ++// XOR two vectors together. ++// Any register operands must be in the first 16 vector registers. 
++.macro _vpxor src1, src2, dst ++.if VL < 64 ++ vpxor \src1, \src2, \dst ++.else ++ vpxord \src1, \src2, \dst ++.endif ++.endm ++ ++// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst ++// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _load_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jle .Lle8\@ ++ ++ // Load 9 <= LEN <= 15 bytes. ++ vmovq (\src), \dst // Load first 8 bytes ++ mov (\src, %rcx), %rax // Load last 8 bytes ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax // Discard overlapping bytes ++ vpinsrq $1, %rax, \dst, \dst ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Load 4 <= LEN <= 8 bytes. ++ mov (\src), %eax // Load first 4 bytes ++ mov (\src, %rcx), \tmp32 // Load last 4 bytes ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ // Load 1 <= LEN <= 3 bytes. ++ add $2, %ecx // LEN - 2 ++ movzbl (\src), %eax // Load first byte ++ jl .Lmovq\@ ++ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, \tmp64 ++ or \tmp64, %rax // Combine the two parts ++.Lmovq\@: ++ vmovq %rax, \dst ++.Ldone\@: ++.endm ++ ++// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. ++// Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _store_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jl .Llt8\@ ++ ++ // Store 8 <= LEN <= 15 bytes. ++ vpextrq $1, \src, %rax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes ++ vmovq \src, (\dst) // Store first 8 bytes ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Store 4 <= LEN <= 7 bytes. ++ vpextrd $1, \src, %eax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes ++ vmovd \src, (\dst) // Store first 4 bytes ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ // Store 1 <= LEN <= 3 bytes. ++ vpextrb $0, \src, 0(\dst) ++ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? ++ jl .Ldone\@ ++ vpextrb $1, \src, 1(\dst) ++ je .Ldone\@ ++ vpextrb $2, \src, 2(\dst) ++.Ldone\@: ++.endm ++ ++// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and ++// XOR each with the zero-th round key. Also update LE_CTR if !\final. ++.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 ++.if \is_xctr ++ .if USE_AVX10 ++ _vmovdqa LE_CTR, AESDATA\i0 ++ vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 ++ .else ++ vpxor XCTR_IV, LE_CTR, AESDATA\i0 ++ vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 ++ .endif ++ vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 ++ ++ .if USE_AVX10 ++ vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 ++ .else ++ vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 ++ vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 ++ .endif ++.else ++ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0 ++ _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 ++ vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 ++ vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1 ++ _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 ++.endif ++.if !\final ++ vpaddq LE_CTR_INC2, LE_CTR, LE_CTR ++.endif ++.endm ++ ++// Do all AES rounds on the data in the given AESDATA vectors, excluding the ++// zero-th and last rounds. 
++.macro _aesenc_loop vecs:vararg ++ mov KEY, %rax ++1: ++ _vbroadcast128 (%rax), RNDKEY ++.irp i, \vecs ++ vaesenc RNDKEY, AESDATA\i, AESDATA\i ++.endr ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++.endm ++ ++// Finalize the keystream blocks in the given AESDATA vectors by doing the last ++// AES round, then XOR those keystream blocks with the corresponding data. ++// Reduce latency by doing the XOR before the vaesenclast, utilizing the ++// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). ++.macro _aesenclast_and_xor vecs:vararg ++.irp i, \vecs ++ _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY ++ vaesenclast RNDKEY, AESDATA\i, AESDATA\i ++.endr ++.irp i, \vecs ++ _vmovdqu AESDATA\i, \i*VL(DST) ++.endr ++.endm ++ ++// XOR the keystream blocks in the specified AESDATA vectors with the ++// corresponding data. ++.macro _xor_data vecs:vararg ++.irp i, \vecs ++ _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i ++.endr ++.irp i, \vecs ++ _vmovdqu AESDATA\i, \i*VL(DST) ++.endr ++.endm ++ ++.macro _aes_ctr_crypt is_xctr ++ ++ // Define register aliases V0-V15 that map to the xmm, ymm, or zmm ++ // registers according to the selected Vector Length (VL). ++.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ++ .if VL == 16 ++ .set V\i, %xmm\i ++ .elseif VL == 32 ++ .set V\i, %ymm\i ++ .elseif VL == 64 ++ .set V\i, %zmm\i ++ .else ++ .error "Unsupported Vector Length (VL)" ++ .endif ++.endr ++ ++ // Function arguments ++ .set KEY, %rdi // Initially points to the start of the ++ // crypto_aes_ctx, then is advanced to ++ // point to the index 1 round key ++ .set KEY32, %edi // Available as temp register after all ++ // keystream blocks have been generated ++ .set SRC, %rsi // Pointer to next source data ++ .set DST, %rdx // Pointer to next destination data ++ .set LEN, %ecx // Remaining length in bytes. ++ // Note: _load_partial_block relies on ++ // this being in %ecx. ++ .set LEN64, %rcx // Zero-extend LEN before using! ++ .set LEN8, %cl ++.if \is_xctr ++ .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE]; ++ .set XCTR_CTR, %r9 // u64 ctr; ++.else ++ .set LE_CTR_PTR, %r8 // const u64 le_ctr[2]; ++.endif ++ ++ // Additional local variables ++ .set RNDKEYLAST_PTR, %r10 ++ .set AESDATA0, V0 ++ .set AESDATA0_XMM, %xmm0 ++ .set AESDATA1, V1 ++ .set AESDATA1_XMM, %xmm1 ++ .set AESDATA2, V2 ++ .set AESDATA3, V3 ++ .set AESDATA4, V4 ++ .set AESDATA5, V5 ++ .set AESDATA6, V6 ++ .set AESDATA7, V7 ++.if \is_xctr ++ .set XCTR_IV, V8 ++.else ++ .set BSWAP_MASK, V8 ++.endif ++ .set LE_CTR, V9 ++ .set LE_CTR_XMM, %xmm9 ++ .set LE_CTR_INC1, V10 ++ .set LE_CTR_INC2, V11 ++ .set RNDKEY0, V12 ++ .set RNDKEYLAST, V13 ++ .set RNDKEY, V14 ++ ++ // Create the first vector of counters. ++.if \is_xctr ++ .if VL == 16 ++ vmovq XCTR_CTR, LE_CTR ++ .elseif VL == 32 ++ vmovq XCTR_CTR, LE_CTR_XMM ++ inc XCTR_CTR ++ vmovq XCTR_CTR, AESDATA0_XMM ++ vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR ++ .else ++ vpbroadcastq XCTR_CTR, LE_CTR ++ vpsrldq $8, LE_CTR, LE_CTR ++ vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR ++ .endif ++ _vbroadcast128 (XCTR_IV_PTR), XCTR_IV ++.else ++ _vbroadcast128 (LE_CTR_PTR), LE_CTR ++ .if VL > 16 ++ vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR ++ .endif ++ _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK ++.endif ++ ++.if VL == 16 ++ _vbroadcast128 .Lone(%rip), LE_CTR_INC1 ++.elseif VL == 32 ++ _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1 ++.else ++ _vbroadcast128 .Lfour(%rip), LE_CTR_INC1 ++.endif ++ vpsllq $1, LE_CTR_INC1, LE_CTR_INC2 ++ ++ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). 
++ movl 480(KEY), %eax ++ ++ // Compute the pointer to the last round key. ++ lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR ++ ++ // Load the zero-th and last round keys. ++ _vbroadcast128 (KEY), RNDKEY0 ++ _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST ++ ++ // Make KEY point to the first round key. ++ add $16, KEY ++ ++ // This is the main loop, which encrypts 8 vectors of data at a time. ++ add $-8*VL, LEN ++ jl .Lloop_8x_done\@ ++.Lloop_8x\@: ++ _prepare_2_ctr_vecs \is_xctr, 0, 1 ++ _prepare_2_ctr_vecs \is_xctr, 2, 3 ++ _prepare_2_ctr_vecs \is_xctr, 4, 5 ++ _prepare_2_ctr_vecs \is_xctr, 6, 7 ++ _aesenc_loop 0,1,2,3,4,5,6,7 ++ _aesenclast_and_xor 0,1,2,3,4,5,6,7 ++ sub $-8*VL, SRC ++ sub $-8*VL, DST ++ add $-8*VL, LEN ++ jge .Lloop_8x\@ ++.Lloop_8x_done\@: ++ sub $-8*VL, LEN ++ jz .Ldone\@ ++ ++ // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream ++ // blocks, depending on the remaining LEN. ++ ++ _prepare_2_ctr_vecs \is_xctr, 0, 1 ++ _prepare_2_ctr_vecs \is_xctr, 2, 3 ++ cmp $4*VL, LEN ++ jle .Lenc_tail_atmost4vecs\@ ++ ++ // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the ++ // first 4 to XOR 4 full vectors of data. Then XOR the remaining data. ++ _prepare_2_ctr_vecs \is_xctr, 4, 5 ++ _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1 ++ _aesenc_loop 0,1,2,3,4,5,6,7 ++ _aesenclast_and_xor 0,1,2,3 ++ vaesenclast RNDKEYLAST, AESDATA4, AESDATA0 ++ vaesenclast RNDKEYLAST, AESDATA5, AESDATA1 ++ vaesenclast RNDKEYLAST, AESDATA6, AESDATA2 ++ vaesenclast RNDKEYLAST, AESDATA7, AESDATA3 ++ sub $-4*VL, SRC ++ sub $-4*VL, DST ++ add $-4*VL, LEN ++ cmp $1*VL-1, LEN ++ jle .Lxor_tail_partial_vec_0\@ ++ _xor_data 0 ++ cmp $2*VL-1, LEN ++ jle .Lxor_tail_partial_vec_1\@ ++ _xor_data 1 ++ cmp $3*VL-1, LEN ++ jle .Lxor_tail_partial_vec_2\@ ++ _xor_data 2 ++ cmp $4*VL-1, LEN ++ jle .Lxor_tail_partial_vec_3\@ ++ _xor_data 3 ++ jmp .Ldone\@ ++ ++.Lenc_tail_atmost4vecs\@: ++ cmp $2*VL, LEN ++ jle .Lenc_tail_atmost2vecs\@ ++ ++ // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the ++ // first 2 to XOR 2 full vectors of data. Then XOR the remaining data. ++ _aesenc_loop 0,1,2,3 ++ _aesenclast_and_xor 0,1 ++ vaesenclast RNDKEYLAST, AESDATA2, AESDATA0 ++ vaesenclast RNDKEYLAST, AESDATA3, AESDATA1 ++ sub $-2*VL, SRC ++ sub $-2*VL, DST ++ add $-2*VL, LEN ++ jmp .Lxor_tail_upto2vecs\@ ++ ++.Lenc_tail_atmost2vecs\@: ++ // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR ++ // the remaining data. ++ _aesenc_loop 0,1 ++ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 ++ vaesenclast RNDKEYLAST, AESDATA1, AESDATA1 ++ ++.Lxor_tail_upto2vecs\@: ++ cmp $1*VL-1, LEN ++ jle .Lxor_tail_partial_vec_0\@ ++ _xor_data 0 ++ cmp $2*VL-1, LEN ++ jle .Lxor_tail_partial_vec_1\@ ++ _xor_data 1 ++ jmp .Ldone\@ ++ ++.Lxor_tail_partial_vec_1\@: ++ add $-1*VL, LEN ++ jz .Ldone\@ ++ sub $-1*VL, SRC ++ sub $-1*VL, DST ++ _vmovdqa AESDATA1, AESDATA0 ++ jmp .Lxor_tail_partial_vec_0\@ ++ ++.Lxor_tail_partial_vec_2\@: ++ add $-2*VL, LEN ++ jz .Ldone\@ ++ sub $-2*VL, SRC ++ sub $-2*VL, DST ++ _vmovdqa AESDATA2, AESDATA0 ++ jmp .Lxor_tail_partial_vec_0\@ ++ ++.Lxor_tail_partial_vec_3\@: ++ add $-3*VL, LEN ++ jz .Ldone\@ ++ sub $-3*VL, SRC ++ sub $-3*VL, DST ++ _vmovdqa AESDATA3, AESDATA0 ++ ++.Lxor_tail_partial_vec_0\@: ++ // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked ++ // loads/stores are available; otherwise it's a bit harder... 
++.if USE_AVX10 ++ .if VL <= 32 ++ mov $-1, %eax ++ bzhi LEN, %eax, %eax ++ kmovd %eax, %k1 ++ .else ++ mov $-1, %rax ++ bzhi LEN64, %rax, %rax ++ kmovq %rax, %k1 ++ .endif ++ vmovdqu8 (SRC), AESDATA1{%k1}{z} ++ _vpxor AESDATA1, AESDATA0, AESDATA0 ++ vmovdqu8 AESDATA0, (DST){%k1} ++.else ++ .if VL == 32 ++ cmp $16, LEN ++ jl 1f ++ vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM ++ vmovdqu AESDATA1_XMM, (DST) ++ add $16, SRC ++ add $16, DST ++ sub $16, LEN ++ jz .Ldone\@ ++ vextracti128 $1, AESDATA0, AESDATA0_XMM ++1: ++ .endif ++ mov LEN, %r10d ++ _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32 ++ vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM ++ mov %r10d, %ecx ++ _store_partial_block AESDATA0_XMM, DST, KEY, KEY32 ++.endif ++ ++.Ldone\@: ++.if VL > 16 ++ vzeroupper ++.endif ++ RET ++.endm ++ ++// Below are the definitions of the functions generated by the above macro. ++// They have the following prototypes: ++// ++// ++// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, ++// const u8 *src, u8 *dst, int len, ++// const u64 le_ctr[2]); ++// ++// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, ++// const u8 *src, u8 *dst, int len, ++// const u8 iv[AES_BLOCK_SIZE], u64 ctr); ++// ++// Both functions generate |len| bytes of keystream, XOR it with the data from ++// |src|, and write the result to |dst|. On non-final calls, |len| must be a ++// multiple of 16. On the final call, |len| can be any value. ++// ++// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated ++// from a 128-bit big endian counter that increments by 1 for each AES block. ++// HOWEVER, to keep the assembly code simple, some of the counter management is ++// left to the caller. aes_ctr64_crypt_* take the counter in little endian ++// form, only increment the low 64 bits internally, do the conversion to big ++// endian internally, and don't write the updated counter back to memory. The ++// caller is responsible for converting the starting IV to the little endian ++// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits ++// being needed and splitting at that point with a carry done in between, and ++// updating le_ctr after each part if the message is multi-part. ++// ++// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption ++// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an ++// easier-to-implement variant of CTR that uses little endian byte order and ++// eliminates carries. |ctr| is the per-message block counter starting at 1. 
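For reference, the XCTR construction described in the comment above reduces to a few lines of C. The sketch below is illustrative only: `aes_encrypt_block()` is a hypothetical one-block AES primitive, not a function provided by this patch.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES primitive; stands in for the real key
 * schedule and encrypt routine (an assumption, not part of this patch). */
void aes_encrypt_block(const void *key, const uint8_t in[16], uint8_t out[16]);

/* One XCTR keystream block: AES(K, IV ^ ctr), where ctr is placed in the
 * low 8 bytes of the block in little endian order and starts at 1. */
static void xctr_keystream_block(const void *key, const uint8_t iv[16],
				 uint64_t ctr, uint8_t out[16])
{
	uint8_t block[16];
	int i;

	memcpy(block, iv, 16);
	for (i = 0; i < 8; i++)
		block[i] ^= (uint8_t)(ctr >> (8 * i));
	aes_encrypt_block(key, block, out);
}
```

Because the counter is XORed into the low 8 bytes in little endian order, no carry can ever propagate into the upper half of the block, which is why the XCTR path above needs no carry handling.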
++ ++.set VL, 16 ++.set USE_AVX10, 0 ++SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) ++ _aes_ctr_crypt 0 ++SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) ++SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx) ++ _aes_ctr_crypt 1 ++SYM_FUNC_END(aes_xctr_crypt_aesni_avx) ++ ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++.set VL, 32 ++.set USE_AVX10, 0 ++SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) ++ _aes_ctr_crypt 0 ++SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) ++SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) ++ _aes_ctr_crypt 1 ++SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) ++ ++.set VL, 32 ++.set USE_AVX10, 1 ++SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256) ++ _aes_ctr_crypt 0 ++SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256) ++SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256) ++ _aes_ctr_crypt 1 ++SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256) ++ ++.set VL, 64 ++.set USE_AVX10, 1 ++SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512) ++ _aes_ctr_crypt 0 ++SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512) ++SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512) ++ _aes_ctr_crypt 1 ++SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512) ++#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ +--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S ++++ /dev/null +@@ -1,597 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ +-/* +- * AES CTR mode by8 optimization with AVX instructions. (x86_64) +- * +- * Copyright(c) 2014 Intel Corporation. +- * +- * Contact Information: +- * James Guilford <james.guilford@intel.com> +- * Sean Gulley <sean.m.gulley@intel.com> +- * Chandramouli Narayanan <mouli@linux.intel.com> +- */ +-/* +- * This is AES128/192/256 CTR mode optimization implementation. It requires +- * the support of Intel(R) AESNI and AVX instructions. +- * +- * This work was inspired by the AES CTR mode optimization published +- * in Intel Optimized IPSEC Cryptographic library. +- * Additional information on it can be found at: +- * https://github.com/intel/intel-ipsec-mb +- */ +- +-#include <linux/linkage.h> +- +-#define VMOVDQ vmovdqu +- +-/* +- * Note: the "x" prefix in these aliases means "this is an xmm register". The +- * alias prefixes have no relation to XCTR where the "X" prefix means "XOR +- * counter". 
+- */ +-#define xdata0 %xmm0 +-#define xdata1 %xmm1 +-#define xdata2 %xmm2 +-#define xdata3 %xmm3 +-#define xdata4 %xmm4 +-#define xdata5 %xmm5 +-#define xdata6 %xmm6 +-#define xdata7 %xmm7 +-#define xcounter %xmm8 // CTR mode only +-#define xiv %xmm8 // XCTR mode only +-#define xbyteswap %xmm9 // CTR mode only +-#define xtmp %xmm9 // XCTR mode only +-#define xkey0 %xmm10 +-#define xkey4 %xmm11 +-#define xkey8 %xmm12 +-#define xkey12 %xmm13 +-#define xkeyA %xmm14 +-#define xkeyB %xmm15 +- +-#define p_in %rdi +-#define p_iv %rsi +-#define p_keys %rdx +-#define p_out %rcx +-#define num_bytes %r8 +-#define counter %r9 // XCTR mode only +-#define tmp %r10 +-#define DDQ_DATA 0 +-#define XDATA 1 +-#define KEY_128 1 +-#define KEY_192 2 +-#define KEY_256 3 +- +-.section .rodata +-.align 16 +- +-byteswap_const: +- .octa 0x000102030405060708090A0B0C0D0E0F +-ddq_low_msk: +- .octa 0x0000000000000000FFFFFFFFFFFFFFFF +-ddq_high_add_1: +- .octa 0x00000000000000010000000000000000 +-ddq_add_1: +- .octa 0x00000000000000000000000000000001 +-ddq_add_2: +- .octa 0x00000000000000000000000000000002 +-ddq_add_3: +- .octa 0x00000000000000000000000000000003 +-ddq_add_4: +- .octa 0x00000000000000000000000000000004 +-ddq_add_5: +- .octa 0x00000000000000000000000000000005 +-ddq_add_6: +- .octa 0x00000000000000000000000000000006 +-ddq_add_7: +- .octa 0x00000000000000000000000000000007 +-ddq_add_8: +- .octa 0x00000000000000000000000000000008 +- +-.text +- +-/* generate a unique variable for ddq_add_x */ +- +-/* generate a unique variable for xmm register */ +-.macro setxdata n +- var_xdata = %xmm\n +-.endm +- +-/* club the numeric 'id' to the symbol 'name' */ +- +-.macro club name, id +-.altmacro +- .if \name == XDATA +- setxdata %\id +- .endif +-.noaltmacro +-.endm +- +-/* +- * do_aes num_in_par load_keys key_len +- * This increments p_in, but not p_out +- */ +-.macro do_aes b, k, key_len, xctr +- .set by, \b +- .set load_keys, \k +- .set klen, \key_len +- +- .if (load_keys) +- vmovdqa 0*16(p_keys), xkey0 +- .endif +- +- .if \xctr +- movq counter, xtmp +- .set i, 0 +- .rept (by) +- club XDATA, i +- vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata +- .set i, (i +1) +- .endr +- .set i, 0 +- .rept (by) +- club XDATA, i +- vpxor xiv, var_xdata, var_xdata +- .set i, (i +1) +- .endr +- .else +- vpshufb xbyteswap, xcounter, xdata0 +- .set i, 1 +- .rept (by - 1) +- club XDATA, i +- vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata +- vptest ddq_low_msk(%rip), var_xdata +- jnz 1f +- vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata +- vpaddq ddq_high_add_1(%rip), xcounter, xcounter +- 1: +- vpshufb xbyteswap, var_xdata, var_xdata +- .set i, (i +1) +- .endr +- .endif +- +- vmovdqa 1*16(p_keys), xkeyA +- +- vpxor xkey0, xdata0, xdata0 +- .if \xctr +- add $by, counter +- .else +- vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter +- vptest ddq_low_msk(%rip), xcounter +- jnz 1f +- vpaddq ddq_high_add_1(%rip), xcounter, xcounter +- 1: +- .endif +- +- .set i, 1 +- .rept (by - 1) +- club XDATA, i +- vpxor xkey0, var_xdata, var_xdata +- .set i, (i +1) +- .endr +- +- vmovdqa 2*16(p_keys), xkeyB +- +- .set i, 0 +- .rept by +- club XDATA, i +- vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_128) +- .if (load_keys) +- vmovdqa 3*16(p_keys), xkey4 +- .endif +- .else +- vmovdqa 3*16(p_keys), xkeyA +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ +- .set i, (i +1) +- .endr +- +- add $(16*by), p_in +- +- .if (klen == 
KEY_128) +- vmovdqa 4*16(p_keys), xkeyB +- .else +- .if (load_keys) +- vmovdqa 4*16(p_keys), xkey4 +- .endif +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 3 */ +- .if (klen == KEY_128) +- vaesenc xkey4, var_xdata, var_xdata +- .else +- vaesenc xkeyA, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- vmovdqa 5*16(p_keys), xkeyA +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 4 */ +- .if (klen == KEY_128) +- vaesenc xkeyB, var_xdata, var_xdata +- .else +- vaesenc xkey4, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_128) +- .if (load_keys) +- vmovdqa 6*16(p_keys), xkey8 +- .endif +- .else +- vmovdqa 6*16(p_keys), xkeyB +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ +- .set i, (i +1) +- .endr +- +- vmovdqa 7*16(p_keys), xkeyA +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 6 */ +- .if (klen == KEY_128) +- vaesenc xkey8, var_xdata, var_xdata +- .else +- vaesenc xkeyB, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_128) +- vmovdqa 8*16(p_keys), xkeyB +- .else +- .if (load_keys) +- vmovdqa 8*16(p_keys), xkey8 +- .endif +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_128) +- .if (load_keys) +- vmovdqa 9*16(p_keys), xkey12 +- .endif +- .else +- vmovdqa 9*16(p_keys), xkeyA +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 8 */ +- .if (klen == KEY_128) +- vaesenc xkeyB, var_xdata, var_xdata +- .else +- vaesenc xkey8, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- vmovdqa 10*16(p_keys), xkeyB +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 9 */ +- .if (klen == KEY_128) +- vaesenc xkey12, var_xdata, var_xdata +- .else +- vaesenc xkeyA, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- .if (klen != KEY_128) +- vmovdqa 11*16(p_keys), xkeyA +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 10 */ +- .if (klen == KEY_128) +- vaesenclast xkeyB, var_xdata, var_xdata +- .else +- vaesenc xkeyB, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- .if (klen != KEY_128) +- .if (load_keys) +- vmovdqa 12*16(p_keys), xkey12 +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_256) +- vmovdqa 13*16(p_keys), xkeyA +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- .if (klen == KEY_256) +- /* key 12 */ +- vaesenc xkey12, var_xdata, var_xdata +- .else +- vaesenclast xkey12, var_xdata, var_xdata +- .endif +- .set i, (i +1) +- .endr +- +- .if (klen == KEY_256) +- vmovdqa 14*16(p_keys), xkeyB +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 13 */ +- vaesenc xkeyA, var_xdata, var_xdata +- .set i, (i +1) +- .endr +- +- .set i, 0 +- .rept by +- club XDATA, i +- /* key 14 */ +- vaesenclast xkeyB, var_xdata, var_xdata +- .set i, (i +1) +- .endr +- .endif +- .endif +- +- .set i, 0 +- .rept (by / 2) +- .set j, (i+1) +- VMOVDQ (i*16 - 16*by)(p_in), xkeyA +- VMOVDQ (j*16 - 16*by)(p_in), xkeyB +- club XDATA, i +- vpxor xkeyA, var_xdata, var_xdata +- club XDATA, j +- vpxor xkeyB, var_xdata, var_xdata +- .set i, (i+2) +- .endr +- +- .if (i < by) +- VMOVDQ (i*16 - 16*by)(p_in), xkeyA +- club XDATA, i +- vpxor xkeyA, var_xdata, var_xdata +- .endif +- +- .set i, 0 +- .rept by +- club XDATA, i +- VMOVDQ var_xdata, i*16(p_out) +- .set i, (i+1) +- .endr +-.endm +- 
+-.macro do_aes_load val, key_len, xctr +- do_aes \val, 1, \key_len, \xctr +-.endm +- +-.macro do_aes_noload val, key_len, xctr +- do_aes \val, 0, \key_len, \xctr +-.endm +- +-/* main body of aes ctr load */ +- +-.macro do_aes_ctrmain key_len, xctr +- cmp $16, num_bytes +- jb .Ldo_return2\xctr\key_len +- +- .if \xctr +- shr $4, counter +- vmovdqu (p_iv), xiv +- .else +- vmovdqa byteswap_const(%rip), xbyteswap +- vmovdqu (p_iv), xcounter +- vpshufb xbyteswap, xcounter, xcounter +- .endif +- +- mov num_bytes, tmp +- and $(7*16), tmp +- jz .Lmult_of_8_blks\xctr\key_len +- +- /* 1 <= tmp <= 7 */ +- cmp $(4*16), tmp +- jg .Lgt4\xctr\key_len +- je .Leq4\xctr\key_len +- +-.Llt4\xctr\key_len: +- cmp $(2*16), tmp +- jg .Leq3\xctr\key_len +- je .Leq2\xctr\key_len +- +-.Leq1\xctr\key_len: +- do_aes_load 1, \key_len, \xctr +- add $(1*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Leq2\xctr\key_len: +- do_aes_load 2, \key_len, \xctr +- add $(2*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +- +-.Leq3\xctr\key_len: +- do_aes_load 3, \key_len, \xctr +- add $(3*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Leq4\xctr\key_len: +- do_aes_load 4, \key_len, \xctr +- add $(4*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Lgt4\xctr\key_len: +- cmp $(6*16), tmp +- jg .Leq7\xctr\key_len +- je .Leq6\xctr\key_len +- +-.Leq5\xctr\key_len: +- do_aes_load 5, \key_len, \xctr +- add $(5*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Leq6\xctr\key_len: +- do_aes_load 6, \key_len, \xctr +- add $(6*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Leq7\xctr\key_len: +- do_aes_load 7, \key_len, \xctr +- add $(7*16), p_out +- and $(~7*16), num_bytes +- jz .Ldo_return2\xctr\key_len +- jmp .Lmain_loop2\xctr\key_len +- +-.Lmult_of_8_blks\xctr\key_len: +- .if (\key_len != KEY_128) +- vmovdqa 0*16(p_keys), xkey0 +- vmovdqa 4*16(p_keys), xkey4 +- vmovdqa 8*16(p_keys), xkey8 +- vmovdqa 12*16(p_keys), xkey12 +- .else +- vmovdqa 0*16(p_keys), xkey0 +- vmovdqa 3*16(p_keys), xkey4 +- vmovdqa 6*16(p_keys), xkey8 +- vmovdqa 9*16(p_keys), xkey12 +- .endif +-.align 16 +-.Lmain_loop2\xctr\key_len: +- /* num_bytes is a multiple of 8 and >0 */ +- do_aes_noload 8, \key_len, \xctr +- add $(8*16), p_out +- sub $(8*16), num_bytes +- jne .Lmain_loop2\xctr\key_len +- +-.Ldo_return2\xctr\key_len: +- .if !\xctr +- /* return updated IV */ +- vpshufb xbyteswap, xcounter, xcounter +- vmovdqu xcounter, (p_iv) +- .endif +- RET +-.endm +- +-/* +- * routine to do AES128 CTR enc/decrypt "by8" +- * XMM registers are clobbered. +- * Saving/restoring must be done at a higher level +- * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, +- * unsigned int num_bytes) +- */ +-SYM_FUNC_START(aes_ctr_enc_128_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_128 0 +- +-SYM_FUNC_END(aes_ctr_enc_128_avx_by8) +- +-/* +- * routine to do AES192 CTR enc/decrypt "by8" +- * XMM registers are clobbered. 
+- * Saving/restoring must be done at a higher level +- * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, +- * unsigned int num_bytes) +- */ +-SYM_FUNC_START(aes_ctr_enc_192_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_192 0 +- +-SYM_FUNC_END(aes_ctr_enc_192_avx_by8) +- +-/* +- * routine to do AES256 CTR enc/decrypt "by8" +- * XMM registers are clobbered. +- * Saving/restoring must be done at a higher level +- * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, +- * unsigned int num_bytes) +- */ +-SYM_FUNC_START(aes_ctr_enc_256_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_256 0 +- +-SYM_FUNC_END(aes_ctr_enc_256_avx_by8) +- +-/* +- * routine to do AES128 XCTR enc/decrypt "by8" +- * XMM registers are clobbered. +- * Saving/restoring must be done at a higher level +- * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, +- * u8* out, unsigned int num_bytes, unsigned int byte_ctr) +- */ +-SYM_FUNC_START(aes_xctr_enc_128_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_128 1 +- +-SYM_FUNC_END(aes_xctr_enc_128_avx_by8) +- +-/* +- * routine to do AES192 XCTR enc/decrypt "by8" +- * XMM registers are clobbered. +- * Saving/restoring must be done at a higher level +- * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, +- * u8* out, unsigned int num_bytes, unsigned int byte_ctr) +- */ +-SYM_FUNC_START(aes_xctr_enc_192_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_192 1 +- +-SYM_FUNC_END(aes_xctr_enc_192_avx_by8) +- +-/* +- * routine to do AES256 XCTR enc/decrypt "by8" +- * XMM registers are clobbered. +- * Saving/restoring must be done at a higher level +- * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, +- * u8* out, unsigned int num_bytes, unsigned int byte_ctr) +- */ +-SYM_FUNC_START(aes_xctr_enc_256_avx_by8) +- /* call the aes main loop */ +- do_aes_ctrmain KEY_256 1 +- +-SYM_FUNC_END(aes_xctr_enc_256_avx_by8) +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -23,7 +23,6 @@ + #include <linux/err.h> + #include <crypto/algapi.h> + #include <crypto/aes.h> +-#include <crypto/ctr.h> + #include <crypto/b128ops.h> + #include <crypto/gcm.h> + #include <crypto/xts.h> +@@ -82,30 +81,8 @@ asmlinkage void aesni_xts_dec(const stru + const u8 *in, unsigned int len, u8 *iv); + + #ifdef CONFIG_X86_64 +- + asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); +-DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); +- +-asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, +- void *keys, u8 *out, unsigned int num_bytes); +-asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, +- void *keys, u8 *out, unsigned int num_bytes); +-asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, +- void *keys, u8 *out, unsigned int num_bytes); +- +- +-asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, +- const void *keys, u8 *out, unsigned int num_bytes, +- unsigned int byte_ctr); +- +-asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, +- const void *keys, u8 *out, unsigned int num_bytes, +- unsigned int byte_ctr); +- +-asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, +- const void *keys, u8 *out, unsigned int num_bytes, +- unsigned int byte_ctr); + #endif + + static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) +@@ -376,24 +353,8 @@ static int cts_cbc_decrypt(struct skciph + } + + #ifdef 
CONFIG_X86_64 +-static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, +- const u8 *in, unsigned int len, u8 *iv) +-{ +- /* +- * based on key length, override with the by8 version +- * of ctr mode encryption/decryption for improved performance +- * aes_set_key_common() ensures that key length is one of +- * {128,192,256} +- */ +- if (ctx->key_length == AES_KEYSIZE_128) +- aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); +- else if (ctx->key_length == AES_KEYSIZE_192) +- aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); +- else +- aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); +-} +- +-static int ctr_crypt(struct skcipher_request *req) ++/* This is the non-AVX version. */ ++static int ctr_crypt_aesni(struct skcipher_request *req) + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); +@@ -407,10 +368,9 @@ static int ctr_crypt(struct skcipher_req + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) +- static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, +- walk.src.virt.addr, +- nbytes & AES_BLOCK_MASK, +- walk.iv); ++ aesni_ctr_enc(ctx, walk.dst.virt.addr, ++ walk.src.virt.addr, ++ nbytes & AES_BLOCK_MASK, walk.iv); + nbytes &= ~AES_BLOCK_MASK; + + if (walk.nbytes == walk.total && nbytes > 0) { +@@ -426,59 +386,6 @@ static int ctr_crypt(struct skcipher_req + } + return err; + } +- +-static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, +- const u8 *in, unsigned int len, u8 *iv, +- unsigned int byte_ctr) +-{ +- if (ctx->key_length == AES_KEYSIZE_128) +- aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, +- byte_ctr); +- else if (ctx->key_length == AES_KEYSIZE_192) +- aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, +- byte_ctr); +- else +- aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, +- byte_ctr); +-} +- +-static int xctr_crypt(struct skcipher_request *req) +-{ +- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); +- struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); +- u8 keystream[AES_BLOCK_SIZE]; +- struct skcipher_walk walk; +- unsigned int nbytes; +- unsigned int byte_ctr = 0; +- int err; +- __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; +- +- err = skcipher_walk_virt(&walk, req, false); +- +- while ((nbytes = walk.nbytes) > 0) { +- kernel_fpu_begin(); +- if (nbytes & AES_BLOCK_MASK) +- aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, +- walk.src.virt.addr, nbytes & AES_BLOCK_MASK, +- walk.iv, byte_ctr); +- nbytes &= ~AES_BLOCK_MASK; +- byte_ctr += walk.nbytes - nbytes; +- +- if (walk.nbytes == walk.total && nbytes > 0) { +- memcpy(block, walk.iv, AES_BLOCK_SIZE); +- block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); +- aesni_enc(ctx, keystream, (u8 *)block); +- crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - +- nbytes, walk.src.virt.addr + walk.nbytes +- - nbytes, keystream, nbytes); +- byte_ctr += nbytes; +- nbytes = 0; +- } +- kernel_fpu_end(); +- err = skcipher_walk_done(&walk, nbytes); +- } +- return err; +-} + #endif + + static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, +@@ -721,8 +628,8 @@ static struct skcipher_alg aesni_skciphe + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, +- .encrypt = ctr_crypt, +- .decrypt = ctr_crypt, ++ .encrypt = ctr_crypt_aesni, ++ .decrypt = ctr_crypt_aesni, + #endif + }, { + .base = { +@@ -748,35 +655,105 @@ static + struct simd_skcipher_alg 
*aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; + + #ifdef CONFIG_X86_64 +-/* +- * XCTR does not have a non-AVX implementation, so it must be enabled +- * conditionally. +- */ +-static struct skcipher_alg aesni_xctr = { +- .base = { +- .cra_name = "__xctr(aes)", +- .cra_driver_name = "__xctr-aes-aesni", +- .cra_priority = 400, +- .cra_flags = CRYPTO_ALG_INTERNAL, +- .cra_blocksize = 1, +- .cra_ctxsize = CRYPTO_AES_CTX_SIZE, +- .cra_module = THIS_MODULE, +- }, +- .min_keysize = AES_MIN_KEY_SIZE, +- .max_keysize = AES_MAX_KEY_SIZE, +- .ivsize = AES_BLOCK_SIZE, +- .chunksize = AES_BLOCK_SIZE, +- .setkey = aesni_skcipher_setkey, +- .encrypt = xctr_crypt, +- .decrypt = xctr_crypt, +-}; +- +-static struct simd_skcipher_alg *aesni_simd_xctr; +- + asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + +-#define DEFINE_XTS_ALG(suffix, driver_name, priority) \ ++/* __always_inline to avoid indirect call */ ++static __always_inline int ++ctr_crypt(struct skcipher_request *req, ++ void (*ctr64_func)(const struct crypto_aes_ctx *key, ++ const u8 *src, u8 *dst, int len, ++ const u64 le_ctr[2])) ++{ ++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); ++ const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); ++ unsigned int nbytes, p1_nbytes, nblocks; ++ struct skcipher_walk walk; ++ u64 le_ctr[2]; ++ u64 ctr64; ++ int err; ++ ++ ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); ++ le_ctr[1] = get_unaligned_be64(&req->iv[0]); ++ ++ err = skcipher_walk_virt(&walk, req, false); ++ ++ while ((nbytes = walk.nbytes) != 0) { ++ if (nbytes < walk.total) { ++ /* Not the end yet, so keep the length block-aligned. */ ++ nbytes = round_down(nbytes, AES_BLOCK_SIZE); ++ nblocks = nbytes / AES_BLOCK_SIZE; ++ } else { ++ /* It's the end, so include any final partial block. */ ++ nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); ++ } ++ ctr64 += nblocks; ++ ++ kernel_fpu_begin(); ++ if (likely(ctr64 >= nblocks)) { ++ /* The low 64 bits of the counter won't overflow. */ ++ (*ctr64_func)(key, walk.src.virt.addr, ++ walk.dst.virt.addr, nbytes, le_ctr); ++ } else { ++ /* ++ * The low 64 bits of the counter will overflow. The ++ * assembly doesn't handle this case, so split the ++ * operation into two at the point where the overflow ++ * will occur. After the first part, add the carry bit. 
++ */ ++ p1_nbytes = min_t(unsigned int, nbytes, ++ (nblocks - ctr64) * AES_BLOCK_SIZE); ++ (*ctr64_func)(key, walk.src.virt.addr, ++ walk.dst.virt.addr, p1_nbytes, le_ctr); ++ le_ctr[0] = 0; ++ le_ctr[1]++; ++ (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, ++ walk.dst.virt.addr + p1_nbytes, ++ nbytes - p1_nbytes, le_ctr); ++ } ++ kernel_fpu_end(); ++ le_ctr[0] = ctr64; ++ ++ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); ++ } ++ ++ put_unaligned_be64(ctr64, &req->iv[8]); ++ put_unaligned_be64(le_ctr[1], &req->iv[0]); ++ ++ return err; ++} ++ ++/* __always_inline to avoid indirect call */ ++static __always_inline int ++xctr_crypt(struct skcipher_request *req, ++ void (*xctr_func)(const struct crypto_aes_ctx *key, ++ const u8 *src, u8 *dst, int len, ++ const u8 iv[AES_BLOCK_SIZE], u64 ctr)) ++{ ++ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); ++ const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); ++ struct skcipher_walk walk; ++ unsigned int nbytes; ++ u64 ctr = 1; ++ int err; ++ ++ err = skcipher_walk_virt(&walk, req, false); ++ while ((nbytes = walk.nbytes) != 0) { ++ if (nbytes < walk.total) ++ nbytes = round_down(nbytes, AES_BLOCK_SIZE); ++ ++ kernel_fpu_begin(); ++ (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, ++ nbytes, req->iv, ctr); ++ kernel_fpu_end(); ++ ++ ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); ++ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); ++ } ++ return err; ++} ++ ++#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ + \ + asmlinkage void \ + aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ +@@ -795,32 +772,80 @@ static int xts_decrypt_##suffix(struct s + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ + } \ + \ +-static struct skcipher_alg aes_xts_alg_##suffix = { \ +- .base = { \ +- .cra_name = "__xts(aes)", \ +- .cra_driver_name = "__" driver_name, \ +- .cra_priority = priority, \ +- .cra_flags = CRYPTO_ALG_INTERNAL, \ +- .cra_blocksize = AES_BLOCK_SIZE, \ +- .cra_ctxsize = XTS_AES_CTX_SIZE, \ +- .cra_module = THIS_MODULE, \ +- }, \ +- .min_keysize = 2 * AES_MIN_KEY_SIZE, \ +- .max_keysize = 2 * AES_MAX_KEY_SIZE, \ +- .ivsize = AES_BLOCK_SIZE, \ +- .walksize = 2 * AES_BLOCK_SIZE, \ +- .setkey = xts_setkey_aesni, \ +- .encrypt = xts_encrypt_##suffix, \ +- .decrypt = xts_decrypt_##suffix, \ +-}; \ ++asmlinkage void \ ++aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ ++ const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ ++ \ ++static int ctr_crypt_##suffix(struct skcipher_request *req) \ ++{ \ ++ return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ ++} \ ++ \ ++asmlinkage void \ ++aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ ++ const u8 *src, u8 *dst, int len, \ ++ const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ + \ +-static struct simd_skcipher_alg *aes_xts_simdalg_##suffix ++static int xctr_crypt_##suffix(struct skcipher_request *req) \ ++{ \ ++ return xctr_crypt(req, aes_xctr_crypt_##suffix); \ ++} \ ++ \ ++static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ ++ .base.cra_name = "__xts(aes)", \ ++ .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ ++ .base.cra_priority = priority, \ ++ .base.cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .base.cra_blocksize = AES_BLOCK_SIZE, \ ++ .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ ++ .base.cra_module = THIS_MODULE, \ ++ .min_keysize = 2 * AES_MIN_KEY_SIZE, \ ++ .max_keysize = 2 * AES_MAX_KEY_SIZE, \ ++ .ivsize = AES_BLOCK_SIZE, \ ++ .walksize = 2 * AES_BLOCK_SIZE, \ 
++ .setkey = xts_setkey_aesni, \ ++ .encrypt = xts_encrypt_##suffix, \ ++ .decrypt = xts_decrypt_##suffix, \ ++}, { \ ++ .base.cra_name = "__ctr(aes)", \ ++ .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ ++ .base.cra_priority = priority, \ ++ .base.cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .base.cra_blocksize = 1, \ ++ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ ++ .base.cra_module = THIS_MODULE, \ ++ .min_keysize = AES_MIN_KEY_SIZE, \ ++ .max_keysize = AES_MAX_KEY_SIZE, \ ++ .ivsize = AES_BLOCK_SIZE, \ ++ .chunksize = AES_BLOCK_SIZE, \ ++ .setkey = aesni_skcipher_setkey, \ ++ .encrypt = ctr_crypt_##suffix, \ ++ .decrypt = ctr_crypt_##suffix, \ ++}, { \ ++ .base.cra_name = "__xctr(aes)", \ ++ .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ ++ .base.cra_priority = priority, \ ++ .base.cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .base.cra_blocksize = 1, \ ++ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ ++ .base.cra_module = THIS_MODULE, \ ++ .min_keysize = AES_MIN_KEY_SIZE, \ ++ .max_keysize = AES_MAX_KEY_SIZE, \ ++ .ivsize = AES_BLOCK_SIZE, \ ++ .chunksize = AES_BLOCK_SIZE, \ ++ .setkey = aesni_skcipher_setkey, \ ++ .encrypt = xctr_crypt_##suffix, \ ++ .decrypt = xctr_crypt_##suffix, \ ++}}; \ ++ \ ++static struct simd_skcipher_alg * \ ++simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] + +-DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); ++DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +-DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); +-DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); +-DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); ++DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); ++DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); ++DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); + #endif + + /* The common part of the x86_64 AES-GCM key struct */ +@@ -1552,8 +1577,9 @@ static int __init register_avx_algs(void + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return 0; +- err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, +- &aes_xts_simdalg_aesni_avx); ++ err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, ++ ARRAY_SIZE(skcipher_algs_aesni_avx), ++ simd_skcipher_algs_aesni_avx); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, +@@ -1561,6 +1587,12 @@ static int __init register_avx_algs(void + aes_gcm_simdalgs_aesni_avx); + if (err) + return err; ++ /* ++ * Note: not all the algorithms registered below actually require ++ * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. ++ * Similarly, the assembler support was added at about the same time. ++ * For simplicity, just always check for VAES and VPCLMULQDQ together. 
++ */ + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_VAES) || +@@ -1568,8 +1600,9 @@ static int __init register_avx_algs(void + !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + return 0; +- err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, +- &aes_xts_simdalg_vaes_avx2); ++ err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, ++ ARRAY_SIZE(skcipher_algs_vaes_avx2), ++ simd_skcipher_algs_vaes_avx2); + if (err) + return err; + +@@ -1580,8 +1613,9 @@ static int __init register_avx_algs(void + XFEATURE_MASK_AVX512, NULL)) + return 0; + +- err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, +- &aes_xts_simdalg_vaes_avx10_256); ++ err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, ++ ARRAY_SIZE(skcipher_algs_vaes_avx10_256), ++ simd_skcipher_algs_vaes_avx10_256); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, +@@ -1593,13 +1627,15 @@ static int __init register_avx_algs(void + if (x86_match_cpu(zmm_exclusion_list)) { + int i; + +- aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; ++ for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) ++ skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + } + +- err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, +- &aes_xts_simdalg_vaes_avx10_512); ++ err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, ++ ARRAY_SIZE(skcipher_algs_vaes_avx10_512), ++ simd_skcipher_algs_vaes_avx10_512); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, +@@ -1613,27 +1649,31 @@ static int __init register_avx_algs(void + + static void unregister_avx_algs(void) + { +- if (aes_xts_simdalg_aesni_avx) +- simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, +- &aes_xts_simdalg_aesni_avx); ++ if (simd_skcipher_algs_aesni_avx[0]) ++ simd_unregister_skciphers(skcipher_algs_aesni_avx, ++ ARRAY_SIZE(skcipher_algs_aesni_avx), ++ simd_skcipher_algs_aesni_avx); + if (aes_gcm_simdalgs_aesni_avx[0]) + simd_unregister_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +- if (aes_xts_simdalg_vaes_avx2) +- simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, +- &aes_xts_simdalg_vaes_avx2); +- if (aes_xts_simdalg_vaes_avx10_256) +- simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, +- &aes_xts_simdalg_vaes_avx10_256); ++ if (simd_skcipher_algs_vaes_avx2[0]) ++ simd_unregister_skciphers(skcipher_algs_vaes_avx2, ++ ARRAY_SIZE(skcipher_algs_vaes_avx2), ++ simd_skcipher_algs_vaes_avx2); ++ if (simd_skcipher_algs_vaes_avx10_256[0]) ++ simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, ++ ARRAY_SIZE(skcipher_algs_vaes_avx10_256), ++ simd_skcipher_algs_vaes_avx10_256); + if (aes_gcm_simdalgs_vaes_avx10_256[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); +- if (aes_xts_simdalg_vaes_avx10_512) +- simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, +- &aes_xts_simdalg_vaes_avx10_512); ++ if (simd_skcipher_algs_vaes_avx10_512[0]) ++ simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, ++ ARRAY_SIZE(skcipher_algs_vaes_avx10_512), ++ 
simd_skcipher_algs_vaes_avx10_512); + if (aes_gcm_simdalgs_vaes_avx10_512[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), +@@ -1666,13 +1706,6 @@ static int __init aesni_init(void) + + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; +-#ifdef CONFIG_X86_64 +- if (boot_cpu_has(X86_FEATURE_AVX)) { +- /* optimize performance of ctr mode encryption transform */ +- static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); +- pr_info("AES CTR mode by8 optimization enabled\n"); +- } +-#endif /* CONFIG_X86_64 */ + + err = crypto_register_alg(&aesni_cipher_alg); + if (err) +@@ -1690,14 +1723,6 @@ static int __init aesni_init(void) + if (err) + goto unregister_skciphers; + +-#ifdef CONFIG_X86_64 +- if (boot_cpu_has(X86_FEATURE_AVX)) +- err = simd_register_skciphers_compat(&aesni_xctr, 1, +- &aesni_simd_xctr); +- if (err) +- goto unregister_aeads; +-#endif /* CONFIG_X86_64 */ +- + err = register_avx_algs(); + if (err) + goto unregister_avx; +@@ -1706,11 +1731,6 @@ static int __init aesni_init(void) + + unregister_avx: + unregister_avx_algs(); +-#ifdef CONFIG_X86_64 +- if (aesni_simd_xctr) +- simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +-unregister_aeads: +-#endif /* CONFIG_X86_64 */ + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); +@@ -1730,10 +1750,6 @@ static void __exit aesni_exit(void) + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); + crypto_unregister_alg(&aesni_cipher_alg); +-#ifdef CONFIG_X86_64 +- if (boot_cpu_has(X86_FEATURE_AVX)) +- simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +-#endif /* CONFIG_X86_64 */ + unregister_avx_algs(); + } + diff --git a/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch b/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch deleted file mode 100644 index 422a48f..0000000 --- a/debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch +++ /dev/null @@ -1,187 +0,0 @@ -From 3ed4205afe9305d71d055554ba27e7b8923865dc Mon Sep 17 00:00:00 2001 -From: Eric Biggers <ebiggers@google.com> -Date: Sun, 13 Oct 2024 21:06:49 -0700 -Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit - -Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit -values instead of 64-bit, since the upper bits of the corresponding -64-bit registers are not guaranteed to be zero. Also update the type of -the length argument to be unsigned int rather than int, as the assembly -code treats it as unsigned. - -Note: there haven't been any reports of this bug actually causing -incorrect behavior. Neither gcc nor clang guarantee zero-extension to -64 bits, but zero-extension is likely to happen in practice because most -instructions that operate on 32-bit registers zero-extend to 64 bits. 
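The zero-extension behaviour this commit message leans on can be observed directly. A minimal GNU C illustration for x86-64 (an aside for context, not part of the patch):

```c
#include <stdio.h>

int main(void)
{
	unsigned long out;

	/* Fill a 64-bit register with all-ones, then rewrite it through
	 * its 32-bit alias: the 32-bit write clears bits 63:32. */
	__asm__("movq $-1, %0\n\t"
		"movl %k0, %k0"
		: "=r"(out));
	printf("%#lx\n", out);	/* prints 0xffffffff */
	return 0;
}
```

The corollary is the bug being fixed: reads of a full 64-bit register carry no such guarantee when only its 32-bit half was passed as an argument, so the assembly must use the 32-bit register names.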
- -Signed-off-by: Eric Biggers <ebiggers@google.com> ---- - arch/x86/crypto/crc32c-intel_glue.c | 2 +- - arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------ - 2 files changed, 27 insertions(+), 32 deletions(-) - ---- a/arch/x86/crypto/crc32c-intel_glue.c -+++ b/arch/x86/crypto/crc32c-intel_glue.c -@@ -41,7 +41,7 @@ - */ - #define CRC32C_PCL_BREAKEVEN 512 - --asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, -+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, - unsigned int crc_init); - #endif /* CONFIG_X86_64 */ - ---- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -@@ -60,7 +60,7 @@ - # regular CRC code that does not interleave the CRC instructions. - #define SMALL_SIZE 200 - --# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); -+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); - - .text - SYM_FUNC_START(crc_pcl) -@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl) - #define block_0 %rcx - #define block_1 %rdx - #define block_2 %r11 --#define len %rsi --#define len_dw %esi --#define len_w %si --#define len_b %sil --#define crc_init_arg %rdx -+#define len %esi -+#define crc_init_arg %edx - #define tmp %rbx --#define crc_init %r8 --#define crc_init_dw %r8d -+#define crc_init %r8d -+#define crc_init_q %r8 - #define crc1 %r9 - #define crc2 %r10 - -@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing -- sub %bufp, len # update buffer length -+ sub bufp_dw, len # update buffer length - .Lalign_loop: -- crc32b %bl, crc_init_dw # compute crc32 of 1-byte -+ crc32b %bl, crc_init # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp - jne .Lalign_loop -@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl) - ################################################################ - - ## compute num of bytes to be processed -- movq len, tmp # save num bytes in tmp - -- cmpq $128*24, len -+ cmp $128*24, len - jae .Lfull_block - - .Lcontinue_block: - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) -- mul len_dw -+ mul len - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do -@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl) - LABEL crc_ %i - .noaltmacro - ENDBR -- crc32q -i*8(block_0), crc_init -+ crc32q -i*8(block_0), crc_init_q - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -@@ -186,7 +182,7 @@ LABEL crc_ %i - LABEL crc_ %i - .noaltmacro - ENDBR -- crc32q -i*8(block_0), crc_init -+ crc32q -i*8(block_0), crc_init_q - crc32q -i*8(block_1), crc1 - # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet - -@@ -200,9 +196,9 @@ LABEL crc_ %i - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) -- subq %rax, tmp # tmp -= rax*24 -+ sub %eax, len # len -= rax*24 - -- movq crc_init, %xmm1 # CRC for block 1 -+ movq crc_init_q, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 -@@ -211,8 +207,8 @@ LABEL crc_ %i - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor -i*8(block_2), %rax -- mov crc2, crc_init -- crc32 %rax, crc_init -+ mov crc2, crc_init_q -+ crc32 %rax, crc_init_q - - ################################################################ - ## 5) Check for end: -@@ -220,10 +216,9 @@ LABEL crc_ %i - - LABEL crc_ 0 - ENDBR -- mov tmp, len -- cmp $128*24, tmp -+ cmp $128*24, len - 
jae .Lfull_block -- cmp $SMALL_SIZE, tmp -+ cmp $SMALL_SIZE, len - jae .Lcontinue_block - - ####################################################################### -@@ -232,30 +227,30 @@ LABEL crc_ 0 - .Lsmall: - test len, len - jz .Ldone -- mov len_dw, %eax -+ mov len, %eax - shr $3, %eax - jz .Ldo_dword - .Ldo_qwords: -- crc32q (bufptmp), crc_init -+ crc32q (bufptmp), crc_init_q - add $8, bufptmp - dec %eax - jnz .Ldo_qwords - .Ldo_dword: -- test $4, len_dw -+ test $4, len - jz .Ldo_word -- crc32l (bufptmp), crc_init_dw -+ crc32l (bufptmp), crc_init - add $4, bufptmp - .Ldo_word: -- test $2, len_dw -+ test $2, len - jz .Ldo_byte -- crc32w (bufptmp), crc_init_dw -+ crc32w (bufptmp), crc_init - add $2, bufptmp - .Ldo_byte: -- test $1, len_dw -+ test $1, len - jz .Ldone -- crc32b (bufptmp), crc_init_dw -+ crc32b (bufptmp), crc_init - .Ldone: -- movq crc_init, %rax -+ mov crc_init, %eax - popq %rsi - popq %rdi - popq %rbx diff --git a/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch b/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch deleted file mode 100644 index 2ad2abe..0000000 --- a/debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch +++ /dev/null @@ -1,374 +0,0 @@ -From 5ffad9b234995f73548763a8487ecd256bba8d8d Mon Sep 17 00:00:00 2001 -From: Eric Biggers <ebiggers@google.com> -Date: Sun, 13 Oct 2024 21:06:49 -0700 -Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling - -crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully -unrolled and uses a jump table to jump into the correct location. This -optimization is misguided, as it bloats the binary code size and -introduces an indirect call. x86_64 CPUs can predict loops well, so it -is fine to just use a loop instead. Loop bookkeeping instructions can -compete with the crc instructions for the ALUs, but this is easily -mitigated by unrolling the loop by a smaller amount, such as 4 times. - -Therefore, re-roll the loop and make related tweaks to the code. - -This reduces the binary code size of crc_pclmul() from 4546 bytes to 418 -bytes, a 91% reduction. In general it also makes the code faster, with -some large improvements seen when retpoline is enabled. - -More detailed performance results are shown below. They are given as -percent improvement in throughput (negative means regressed) for CPU -microarchitecture vs. input length in bytes. E.g. an improvement from -40 GB/s to 50 GB/s would be listed as 25%. 
- -Table 1: Results with retpoline enabled (the default): - - | 512 | 833 | 1024 | 2000 | 3173 | 4096 | - ---------------------+-------+-------+-------+------ +-------+-------+ - Intel Haswell | 35.0% | 20.7% | 17.8% | 9.7% | -0.2% | 4.4% | - Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% | 0.0% | 5.4% | - AMD Zen 2 | 29.5% | 17.2% | 13.5% | 8.6% | -0.5% | 2.8% | - -Table 2: Results with retpoline disabled: - - | 512 | 833 | 1024 | 2000 | 3173 | 4096 | - ---------------------+-------+-------+-------+------ +-------+-------+ - Intel Haswell | 3.3% | 4.8% | 4.5% | 0.9% | -2.9% | 0.3% | - Intel Emerald Rapids | 7.5% | 6.4% | 5.2% | 2.3% | -0.0% | 0.6% | - AMD Zen 2 | 11.8% | 1.4% | 0.2% | 1.3% | -0.9% | -0.2% | - -Signed-off-by: Eric Biggers <ebiggers@google.com> ---- - arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++------------- - 1 file changed, 92 insertions(+), 141 deletions(-) - ---- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S -@@ -7,6 +7,7 @@ - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. -+ * Copyright 2024 Google LLC - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> -@@ -44,18 +45,9 @@ - */ - - #include <linux/linkage.h> --#include <asm/nospec-branch.h> - - ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - --.macro LABEL prefix n --.L\prefix\n\(): --.endm -- --.macro JMPTBL_ENTRY i --.quad .Lcrc_\i --.endm -- - # Define threshold below which buffers are considered "small" and routed to - # regular CRC code that does not interleave the CRC instructions. - #define SMALL_SIZE 200 -@@ -64,139 +56,116 @@ - - .text - SYM_FUNC_START(crc_pcl) --#define bufp rdi --#define bufp_dw %edi --#define bufp_w %di --#define bufp_b %dil --#define bufptmp %rcx --#define block_0 %rcx --#define block_1 %rdx --#define block_2 %r11 --#define len %esi --#define crc_init_arg %edx --#define tmp %rbx --#define crc_init %r8d --#define crc_init_q %r8 --#define crc1 %r9 --#define crc2 %r10 -- -- pushq %rbx -- pushq %rdi -- pushq %rsi -- -- ## Move crc_init for Linux to a different -- mov crc_init_arg, crc_init -+#define bufp %rdi -+#define bufp_d %edi -+#define len %esi -+#define crc_init %edx -+#define crc_init_q %rdx -+#define n_misaligned %ecx /* overlaps chunk_bytes! */ -+#define n_misaligned_q %rcx -+#define chunk_bytes %ecx /* overlaps n_misaligned! */ -+#define chunk_bytes_q %rcx -+#define crc1 %r8 -+#define crc2 %r9 - -- mov %bufp, bufptmp # rdi = *buf - cmp $SMALL_SIZE, len - jb .Lsmall - - ################################################################ - ## 1) ALIGN: - ################################################################ -- neg %bufp -- and $7, %bufp # calculate the unalignment amount of -+ mov bufp_d, n_misaligned -+ neg n_misaligned -+ and $7, n_misaligned # calculate the misalignment amount of - # the address -- je .Lproc_block # Skip if aligned -+ je .Laligned # Skip if aligned - -+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align -+ # the remaining data to an 8-byte boundary. 
- .Ldo_align: -- #### Calculate CRC of unaligned bytes of the buffer (if any) -- movq (bufptmp), tmp # load a quadward from the buffer -- add %bufp, bufptmp # align buffer pointer for quadword -- # processing -- sub bufp_dw, len # update buffer length -+ movq (bufp), %rax -+ add n_misaligned_q, bufp -+ sub n_misaligned, len - .Lalign_loop: -- crc32b %bl, crc_init # compute crc32 of 1-byte -- shr $8, tmp # get next byte -- dec %bufp -+ crc32b %al, crc_init # compute crc32 of 1-byte -+ shr $8, %rax # get next byte -+ dec n_misaligned - jne .Lalign_loop -- --.Lproc_block: -+.Laligned: - - ################################################################ -- ## 2) PROCESS BLOCKS: -+ ## 2) PROCESS BLOCK: - ################################################################ - -- ## compute num of bytes to be processed -- - cmp $128*24, len - jae .Lfull_block - --.Lcontinue_block: -- ## len < 128*24 -- movq $2731, %rax # 2731 = ceil(2^16 / 24) -- mul len -- shrq $16, %rax -- -- ## eax contains floor(bytes / 24) = num 24-byte chunks to do -- -- ## process rax 24-byte chunks (128 >= rax >= 0) -- -- ## compute end address of each block -- ## block 0 (base addr + RAX * 8) -- ## block 1 (base addr + RAX * 16) -- ## block 2 (base addr + RAX * 24) -- lea (bufptmp, %rax, 8), block_0 -- lea (block_0, %rax, 8), block_1 -- lea (block_1, %rax, 8), block_2 -- -- xor crc1, crc1 -- xor crc2, crc2 -- -- ## branch into array -- leaq jump_table(%rip), %bufp -- mov (%bufp,%rax,8), %bufp -- JMP_NOSPEC bufp -+.Lpartial_block: -+ # Compute floor(len / 24) to get num qwords to process from each lane. -+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) -+ shr $16, %eax -+ jmp .Lcrc_3lanes - -- ################################################################ -- ## 2a) PROCESS FULL BLOCKS: -- ################################################################ - .Lfull_block: -- movl $128,%eax -- lea 128*8*2(block_0), block_1 -- lea 128*8*3(block_0), block_2 -- add $128*8*1, block_0 -- -- xor crc1,crc1 -- xor crc2,crc2 -- -- # Fall through into top of crc array (crc_128) -+ # Processing 128 qwords from each lane. -+ mov $128, %eax - - ################################################################ -- ## 3) CRC Array: -+ ## 3) CRC each of three lanes: - ################################################################ - -- i=128 --.rept 128-1 --.altmacro --LABEL crc_ %i --.noaltmacro -- ENDBR -- crc32q -i*8(block_0), crc_init_q -- crc32q -i*8(block_1), crc1 -- crc32q -i*8(block_2), crc2 -- i=(i-1) --.endr -- --.altmacro --LABEL crc_ %i --.noaltmacro -- ENDBR -- crc32q -i*8(block_0), crc_init_q -- crc32q -i*8(block_1), crc1 --# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet -+.Lcrc_3lanes: -+ xor crc1,crc1 -+ xor crc2,crc2 -+ mov %eax, chunk_bytes -+ shl $3, chunk_bytes # num bytes to process from each lane -+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter -+ jl .Lcrc_3lanes_4x_done -+ -+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop -+ # bookkeeping instructions, which can compete with crc32q for the ALUs. 
-+.Lcrc_3lanes_4x_loop: -+ crc32q (bufp), crc_init_q -+ crc32q (bufp,chunk_bytes_q), crc1 -+ crc32q (bufp,chunk_bytes_q,2), crc2 -+ crc32q 8(bufp), crc_init_q -+ crc32q 8(bufp,chunk_bytes_q), crc1 -+ crc32q 8(bufp,chunk_bytes_q,2), crc2 -+ crc32q 16(bufp), crc_init_q -+ crc32q 16(bufp,chunk_bytes_q), crc1 -+ crc32q 16(bufp,chunk_bytes_q,2), crc2 -+ crc32q 24(bufp), crc_init_q -+ crc32q 24(bufp,chunk_bytes_q), crc1 -+ crc32q 24(bufp,chunk_bytes_q,2), crc2 -+ add $32, bufp -+ sub $4, %eax -+ jge .Lcrc_3lanes_4x_loop -+ -+.Lcrc_3lanes_4x_done: -+ add $4, %eax -+ jz .Lcrc_3lanes_last_qword -+ -+.Lcrc_3lanes_1x_loop: -+ crc32q (bufp), crc_init_q -+ crc32q (bufp,chunk_bytes_q), crc1 -+ crc32q (bufp,chunk_bytes_q,2), crc2 -+ add $8, bufp -+ dec %eax -+ jnz .Lcrc_3lanes_1x_loop - -- mov block_2, block_0 -+.Lcrc_3lanes_last_qword: -+ crc32q (bufp), crc_init_q -+ crc32q (bufp,chunk_bytes_q), crc1 -+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet - - ################################################################ - ## 4) Combine three results: - ################################################################ - -- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 -- shlq $3, %rax # rax *= 8 -- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 -- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) -- sub %eax, len # len -= rax*24 -+ lea (K_table-8)(%rip), %rax # first entry is for idx 1 -+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 -+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 -+ sub %eax, len # len -= chunk_bytes * 3 - - movq crc_init_q, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 -@@ -206,20 +175,19 @@ LABEL crc_ %i - - pxor %xmm2,%xmm1 - movq %xmm1, %rax -- xor -i*8(block_2), %rax -+ xor (bufp,chunk_bytes_q,2), %rax - mov crc2, crc_init_q - crc32 %rax, crc_init_q -+ lea 8(bufp,chunk_bytes_q,2), bufp - - ################################################################ -- ## 5) Check for end: -+ ## 5) If more blocks remain, goto (2): - ################################################################ - --LABEL crc_ 0 -- ENDBR - cmp $128*24, len -- jae .Lfull_block -+ jae .Lfull_block - cmp $SMALL_SIZE, len -- jae .Lcontinue_block -+ jae .Lpartial_block - - ####################################################################### - ## 6) Process any remainder without interleaving: -@@ -231,47 +199,30 @@ LABEL crc_ 0 - shr $3, %eax - jz .Ldo_dword - .Ldo_qwords: -- crc32q (bufptmp), crc_init_q -- add $8, bufptmp -+ crc32q (bufp), crc_init_q -+ add $8, bufp - dec %eax - jnz .Ldo_qwords - .Ldo_dword: - test $4, len - jz .Ldo_word -- crc32l (bufptmp), crc_init -- add $4, bufptmp -+ crc32l (bufp), crc_init -+ add $4, bufp - .Ldo_word: - test $2, len - jz .Ldo_byte -- crc32w (bufptmp), crc_init -- add $2, bufptmp -+ crc32w (bufp), crc_init -+ add $2, bufp - .Ldo_byte: - test $1, len - jz .Ldone -- crc32b (bufptmp), crc_init -+ crc32b (bufp), crc_init - .Ldone: - mov crc_init, %eax -- popq %rsi -- popq %rdi -- popq %rbx - RET - SYM_FUNC_END(crc_pcl) - - .section .rodata, "a", @progbits -- ################################################################ -- ## jump table Table is 129 entries x 2 bytes each -- ################################################################ --.align 4 --jump_table: -- i=0 --.rept 129 --.altmacro --JMPTBL_ENTRY %i --.noaltmacro -- i=i+1 --.endr -- -- - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each diff --git 
a/debian/patches/patchset-pf/fixes/0001-tpm-do-not-start-chip-while-suspended.patch b/debian/patches/patchset-pf/fixes/0001-tpm-do-not-start-chip-while-suspended.patch new file mode 100644 index 0000000..4277af3 --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0001-tpm-do-not-start-chip-while-suspended.patch @@ -0,0 +1,94 @@ +From 52af8f543922b47a31ddbb6ffb81f40ad9993309 Mon Sep 17 00:00:00 2001 +From: Thadeu Lima de Souza Cascardo <cascardo@igalia.com> +Date: Fri, 7 Feb 2025 15:07:46 -0300 +Subject: tpm: do not start chip while suspended + +Checking TPM_CHIP_FLAG_SUSPENDED after the call to tpm_find_get_ops() can +lead to a spurious tpm_chip_start() call: + +[35985.503771] i2c i2c-1: Transfer while suspended +[35985.503796] WARNING: CPU: 0 PID: 74 at drivers/i2c/i2c-core.h:56 __i2c_transfer+0xbe/0x810 +[35985.503802] Modules linked in: +[35985.503808] CPU: 0 UID: 0 PID: 74 Comm: hwrng Tainted: G W 6.13.0-next-20250203-00005-gfa0cb5642941 #19 9c3d7f78192f2d38e32010ac9c90fdc71109ef6f +[35985.503814] Tainted: [W]=WARN +[35985.503817] Hardware name: Google Morphius/Morphius, BIOS Google_Morphius.13434.858.0 10/26/2023 +[35985.503819] RIP: 0010:__i2c_transfer+0xbe/0x810 +[35985.503825] Code: 30 01 00 00 4c 89 f7 e8 40 fe d8 ff 48 8b 93 80 01 00 00 48 85 d2 75 03 49 8b 16 48 c7 c7 0a fb 7c a7 48 89 c6 e8 32 ad b0 fe <0f> 0b b8 94 ff ff ff e9 33 04 00 00 be 02 00 00 00 83 fd 02 0f 5 +[35985.503828] RSP: 0018:ffffa106c0333d30 EFLAGS: 00010246 +[35985.503833] RAX: 074ba64aa20f7000 RBX: ffff8aa4c1167120 RCX: 0000000000000000 +[35985.503836] RDX: 0000000000000000 RSI: ffffffffa77ab0e4 RDI: 0000000000000001 +[35985.503838] RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000000 +[35985.503841] R10: 0000000000000004 R11: 00000001000313d5 R12: ffff8aa4c10f1820 +[35985.503843] R13: ffff8aa4c0e243c0 R14: ffff8aa4c1167250 R15: ffff8aa4c1167120 +[35985.503846] FS: 0000000000000000(0000) GS:ffff8aa4eae00000(0000) knlGS:0000000000000000 +[35985.503849] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[35985.503852] CR2: 00007fab0aaf1000 CR3: 0000000105328000 CR4: 00000000003506f0 +[35985.503855] Call Trace: +[35985.503859] <TASK> +[35985.503863] ? __warn+0xd4/0x260 +[35985.503868] ? __i2c_transfer+0xbe/0x810 +[35985.503874] ? report_bug+0xf3/0x210 +[35985.503882] ? handle_bug+0x63/0xb0 +[35985.503887] ? exc_invalid_op+0x16/0x50 +[35985.503892] ? asm_exc_invalid_op+0x16/0x20 +[35985.503904] ? __i2c_transfer+0xbe/0x810 +[35985.503913] tpm_cr50_i2c_transfer_message+0x24/0xf0 +[35985.503920] tpm_cr50_i2c_read+0x8e/0x120 +[35985.503928] tpm_cr50_request_locality+0x75/0x170 +[35985.503935] tpm_chip_start+0x116/0x160 +[35985.503942] tpm_try_get_ops+0x57/0x90 +[35985.503948] tpm_find_get_ops+0x26/0xd0 +[35985.503955] tpm_get_random+0x2d/0x80 + +Don't move forward with tpm_chip_start() inside tpm_try_get_ops(), unless +TPM_CHIP_FLAG_SUSPENDED is not set. tpm_find_get_ops() will return NULL in +such a failure case. 
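In other words, the fix moves the suspend test under chip->tpm_mutex. A minimal userspace sketch of that check-under-lock shape, using pthread stand-ins and hypothetical names (`struct chip`, `try_get_ops`, `CHIP_SUSPENDED`) rather than the kernel API:

```c
#include <pthread.h>
#include <stdio.h>

struct chip {
	pthread_mutex_t lock;	/* stands in for chip->tpm_mutex */
	unsigned int flags;
};
#define CHIP_SUSPENDED 0x1

/* The SUSPENDED test happens under the same mutex the suspend path takes,
 * so a suspend cannot slip in between the check and the chip start. */
static int try_get_ops(struct chip *chip)
{
	pthread_mutex_lock(&chip->lock);
	if (chip->flags & CHIP_SUSPENDED) {
		pthread_mutex_unlock(&chip->lock);
		return -1;	/* caller bails out; no IO is issued */
	}
	/* ...chip start and the operation would run here, under the lock... */
	pthread_mutex_unlock(&chip->lock);
	return 0;
}

int main(void)
{
	struct chip c = { PTHREAD_MUTEX_INITIALIZER, CHIP_SUSPENDED };

	printf("%d\n", try_get_ops(&c));	/* -1: denied while suspended */
	return 0;
}
```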
+ +Fixes: 9265fed6db60 ("tpm: Lock TPM chip in tpm_pm_suspend() first") +Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@igalia.com> +Cc: stable@vger.kernel.org +Cc: Jerry Snitselaar <jsnitsel@redhat.com> +Cc: Mike Seo <mikeseohyungjin@gmail.com> +Cc: Jarkko Sakkinen <jarkko@kernel.org> +Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com> +Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> +--- + drivers/char/tpm/tpm-chip.c | 5 +++++ + drivers/char/tpm/tpm-interface.c | 7 ------- + 2 files changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/char/tpm/tpm-chip.c ++++ b/drivers/char/tpm/tpm-chip.c +@@ -168,6 +168,11 @@ int tpm_try_get_ops(struct tpm_chip *chi + goto out_ops; + + mutex_lock(&chip->tpm_mutex); ++ ++ /* tmp_chip_start may issue IO that is denied while suspended */ ++ if (chip->flags & TPM_CHIP_FLAG_SUSPENDED) ++ goto out_lock; ++ + rc = tpm_chip_start(chip); + if (rc) + goto out_lock; +--- a/drivers/char/tpm/tpm-interface.c ++++ b/drivers/char/tpm/tpm-interface.c +@@ -445,18 +445,11 @@ int tpm_get_random(struct tpm_chip *chip + if (!chip) + return -ENODEV; + +- /* Give back zero bytes, as TPM chip has not yet fully resumed: */ +- if (chip->flags & TPM_CHIP_FLAG_SUSPENDED) { +- rc = 0; +- goto out; +- } +- + if (chip->flags & TPM_CHIP_FLAG_TPM2) + rc = tpm2_get_random(chip, out, max); + else + rc = tpm1_get_random(chip, out, max); + +-out: + tpm_put_ops(chip); + return rc; + } diff --git a/debian/patches/patchset-pf/fixes/0002-x86-insn_decoder_test-allow-longer-symbol-names.patch b/debian/patches/patchset-pf/fixes/0002-x86-insn_decoder_test-allow-longer-symbol-names.patch new file mode 100644 index 0000000..52efb9e --- /dev/null +++ b/debian/patches/patchset-pf/fixes/0002-x86-insn_decoder_test-allow-longer-symbol-names.patch @@ -0,0 +1,45 @@ +From 2c26fd36ffb4bed4d55f9c7ba8d4f22db093eba2 Mon Sep 17 00:00:00 2001 +From: David Rheinsberg <david@readahead.eu> +Date: Tue, 24 Jan 2023 12:04:59 +0100 +Subject: x86/insn_decoder_test: allow longer symbol-names + +Increase the allowed line-length of the insn-decoder-test to 4k to allow +for symbol-names longer than 256 characters. + +The insn-decoder-test takes objdump output as input, which may contain +symbol-names as instruction arguments. With rust-code entering the +kernel, those symbol-names will include mangled-symbols which might +exceed the current line-length-limit of the tool. + +By bumping the line-length-limit of the tool to 4k, we get a reasonable +buffer for all objdump outputs I have seen so far. Unfortunately, ELF +symbol-names are not restricted in length, so technically this might +still end up failing if we encounter longer names in the future. 
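
To make the overflow concrete: the tool reads objdump output line by line into a fixed buffer (a plain fgets() loop is assumed here for illustration), so with BUFSIZE at 256 the tail of an over-long symbol comes back as a spurious extra line, which the test then rejects as malformed -- the "tBb_+0xf2>" fragment quoted below:

    #include <stdio.h>

    #define BUFSIZE 4096            /* was 256 */

    int main(void)
    {
            char line[BUFSIZE];

            /* fgets() stops after BUFSIZE-1 bytes; anything left over on
             * the same input line is returned by the next call as a bogus
             * "line" of its own. */
            while (fgets(line, BUFSIZE, stdin))
                    fputs(line, stdout);
            return 0;
    }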
+ +My compile-failure looks like this: + + arch/x86/tools/insn_decoder_test: error: malformed line 1152000: + tBb_+0xf2> + +..which overflowed by 10 characters reading this line: + + ffffffff81458193: 74 3d je ffffffff814581d2 <_RNvXse_NtNtNtCshGpAVYOtgW1_4core4iter8adapters7flattenINtB5_13FlattenCompatINtNtB7_3map3MapNtNtNtBb_3str4iter5CharsNtB1v_17CharEscapeDefaultENtNtBb_4char13EscapeDefaultENtNtBb_3fmt5Debug3fmtBb_+0xf2> + +Signed-off-by: David Rheinsberg <david@readahead.eu> +Signed-off-by: Scott Weaver <scweaver@redhat.com> +Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/63 +--- + arch/x86/tools/insn_decoder_test.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/tools/insn_decoder_test.c ++++ b/arch/x86/tools/insn_decoder_test.c +@@ -106,7 +106,7 @@ static void parse_args(int argc, char ** + } + } + +-#define BUFSIZE 256 ++#define BUFSIZE 4096 + + int main(int argc, char **argv) + { diff --git a/debian/patches/patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch b/debian/patches/patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch deleted file mode 100644 index 9b5c4ab..0000000 --- a/debian/patches/patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch +++ /dev/null @@ -1,178 +0,0 @@ -From 6fe0d820b76da3a4f1f8d1fd605b2afc9edcb3f8 Mon Sep 17 00:00:00 2001 -From: "Darrick J. Wong" <djwong@kernel.org> -Date: Sun, 3 Nov 2024 20:19:39 -0800 -Subject: xfs: fix chown with rt quota - -Make chown's quota adjustments work with realtime files. This is mostly -a matter of calling xfs_inode_count_blocks on a given file to figure out -the number of blocks allocated to the data device and to the realtime -device, and using those quantities to update the quota accounting when -the id changes. Delayed allocation reservations are moved from the old -dquot's incore reservation to the new dquot's incore reservation. - -Note that there was a missing ILOCK bug in xfs_qm_dqusage_adjust that we -must fix before calling xfs_iread_extents. Prior to 2.6.37 the locking -was correct, but then someone removed the ILOCK as part of a cleanup. -Nobody noticed because nowhere in the git history have we ever supported -rt+quota so nobody can use this. - -I'm leaving git breadcrumbs in case anyone is desperate enough to try to -backport the rtquota code to old kernels. - -Not-Cc: <stable@vger.kernel.org> # v2.6.37 -Fixes: 52fda114249578 ("xfs: simplify xfs_qm_dqusage_adjust") -Signed-off-by: Darrick J. 
Wong <djwong@kernel.org> -Reviewed-by: Christoph Hellwig <hch@lst.de> ---- - fs/xfs/xfs_qm.c | 44 +++++++++++++++++++++++++++----------------- - fs/xfs/xfs_trans.c | 31 +++++++++++++++++++++++++++++-- - 2 files changed, 56 insertions(+), 19 deletions(-) - ---- a/fs/xfs/xfs_qm.c -+++ b/fs/xfs/xfs_qm.c -@@ -1181,8 +1181,8 @@ xfs_qm_dqusage_adjust( - void *data) - { - struct xfs_inode *ip; -- xfs_qcnt_t nblks; -- xfs_filblks_t rtblks = 0; /* total rt blks */ -+ xfs_filblks_t nblks, rtblks; -+ unsigned int lock_mode; - int error; - - ASSERT(XFS_IS_QUOTA_ON(mp)); -@@ -1219,18 +1219,17 @@ xfs_qm_dqusage_adjust( - - ASSERT(ip->i_delayed_blks == 0); - -+ lock_mode = xfs_ilock_data_map_shared(ip); - if (XFS_IS_REALTIME_INODE(ip)) { -- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); -- - error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); -- if (error) -+ if (error) { -+ xfs_iunlock(ip, lock_mode); - goto error0; -- -- xfs_bmap_count_leaves(ifp, &rtblks); -+ } - } -- -- nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; -+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); - xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); -+ xfs_iunlock(ip, lock_mode); - - /* - * Add the (disk blocks and inode) resources occupied by this -@@ -1892,9 +1891,8 @@ xfs_qm_vop_chown( - struct xfs_dquot *newdq) - { - struct xfs_dquot *prevdq; -- uint bfield = XFS_IS_REALTIME_INODE(ip) ? -- XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; -- -+ xfs_filblks_t dblocks, rblocks; -+ bool isrt = XFS_IS_REALTIME_INODE(ip); - - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); -@@ -1904,11 +1902,17 @@ xfs_qm_vop_chown( - ASSERT(prevdq); - ASSERT(prevdq != newdq); - -- xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks)); -+ xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); -+ -+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_BCOUNT, -+ -(xfs_qcnt_t)dblocks); -+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_RTBCOUNT, -+ -(xfs_qcnt_t)rblocks); - xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1); - - /* the sparkling new dquot */ -- xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks); -+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_BCOUNT, dblocks); -+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_RTBCOUNT, rblocks); - xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1); - - /* -@@ -1918,7 +1922,8 @@ xfs_qm_vop_chown( - * (having already bumped up the real counter) so that we don't have - * any reservation to give back when we commit. - */ -- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, -+ xfs_trans_mod_dquot(tp, newdq, -+ isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, - -ip->i_delayed_blks); - - /* -@@ -1930,8 +1935,13 @@ xfs_qm_vop_chown( - */ - tp->t_flags |= XFS_TRANS_DIRTY; - xfs_dqlock(prevdq); -- ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); -- prevdq->q_blk.reserved -= ip->i_delayed_blks; -+ if (isrt) { -+ ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); -+ prevdq->q_rtb.reserved -= ip->i_delayed_blks; -+ } else { -+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); -+ prevdq->q_blk.reserved -= ip->i_delayed_blks; -+ } - xfs_dqunlock(prevdq); - - /* ---- a/fs/xfs/xfs_trans.c -+++ b/fs/xfs/xfs_trans.c -@@ -1257,11 +1257,26 @@ retry: - gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; - pdqp = (new_pdqp != ip->i_pdquot) ? 
new_pdqp : NULL; - if (udqp || gdqp || pdqp) { -+ xfs_filblks_t dblocks, rblocks; - unsigned int qflags = XFS_QMOPT_RES_REGBLKS; -+ bool isrt = XFS_IS_REALTIME_INODE(ip); - - if (force) - qflags |= XFS_QMOPT_FORCE_RES; - -+ if (isrt) { -+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); -+ if (error) -+ goto out_cancel; -+ } -+ -+ xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); -+ -+ if (isrt) -+ rblocks += ip->i_delayed_blks; -+ else -+ dblocks += ip->i_delayed_blks; -+ - /* - * Reserve enough quota to handle blocks on disk and reserved - * for a delayed allocation. We'll actually transfer the -@@ -1269,8 +1284,20 @@ retry: - * though that part is only semi-transactional. - */ - error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, -- pdqp, ip->i_nblocks + ip->i_delayed_blks, -- 1, qflags); -+ pdqp, dblocks, 1, qflags); -+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) { -+ xfs_trans_cancel(tp); -+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); -+ retried = true; -+ goto retry; -+ } -+ if (error) -+ goto out_cancel; -+ -+ /* Do the same for realtime. */ -+ qflags = XFS_QMOPT_RES_RTBLKS | (qflags & XFS_QMOPT_FORCE_RES); -+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, -+ pdqp, rblocks, 0, qflags); - if ((error == -EDQUOT || error == -ENOSPC) && !retried) { - xfs_trans_cancel(tp); - xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); diff --git a/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.7.patch similarity index 73% rename from debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch rename to debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.7.patch index feb072e..25aab08 100644 --- a/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch +++ b/debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.7.patch @@ -1,71 +1,75 @@ -From 444b8286e00345a68fecc43eaa0aabdb10d7b39b Mon Sep 17 00:00:00 2001 +From ce390f13283adf62f17365d2f55e65e442e2edd8 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko <oleksandr@natalenko.name> -Date: Mon, 29 Jul 2024 00:42:23 +0200 -Subject: zstd: import upstream v1.5.6 +Date: Thu, 20 Feb 2025 09:03:32 +0100 +Subject: zstd: import upstream v1.5.7 Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> --- - include/linux/zstd.h | 2 +- - include/linux/zstd_errors.h | 23 +- - include/linux/zstd_lib.h | 850 +++++-- - lib/zstd/Makefile | 2 +- + include/linux/zstd.h | 86 +- + include/linux/zstd_errors.h | 30 +- + include/linux/zstd_lib.h | 1123 ++++-- + lib/zstd/Makefile | 3 +- lib/zstd/common/allocations.h | 56 + - lib/zstd/common/bits.h | 149 ++ - lib/zstd/common/bitstream.h | 127 +- - lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/bits.h | 150 + + lib/zstd/common/bitstream.h | 155 +- + lib/zstd/common/compiler.h | 151 +- lib/zstd/common/cpu.h | 3 +- lib/zstd/common/debug.c | 9 +- - lib/zstd/common/debug.h | 34 +- + lib/zstd/common/debug.h | 37 +- lib/zstd/common/entropy_common.c | 42 +- - lib/zstd/common/error_private.c | 12 +- - lib/zstd/common/error_private.h | 84 +- - lib/zstd/common/fse.h | 94 +- - lib/zstd/common/fse_decompress.c | 130 +- - lib/zstd/common/huf.h | 237 +- + lib/zstd/common/error_private.c | 13 +- + lib/zstd/common/error_private.h | 88 +- + lib/zstd/common/fse.h | 103 +- + lib/zstd/common/fse_decompress.c | 132 +- + lib/zstd/common/huf.h | 240 +- lib/zstd/common/mem.h | 3 +- - lib/zstd/common/portability_macros.h | 28 +- + 
lib/zstd/common/portability_macros.h | 45 +- lib/zstd/common/zstd_common.c | 38 +- lib/zstd/common/zstd_deps.h | 16 +- - lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/common/zstd_internal.h | 153 +- lib/zstd/compress/clevels.h | 3 +- lib/zstd/compress/fse_compress.c | 74 +- - lib/zstd/compress/hist.c | 3 +- - lib/zstd/compress/hist.h | 3 +- - lib/zstd/compress/huf_compress.c | 441 ++-- - lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- - lib/zstd/compress/zstd_compress_internal.h | 359 ++- - lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/hist.c | 13 +- + lib/zstd/compress/hist.h | 10 +- + lib/zstd/compress/huf_compress.c | 441 ++- + lib/zstd/compress/zstd_compress.c | 3289 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 621 +++- + lib/zstd/compress/zstd_compress_literals.c | 157 +- lib/zstd/compress/zstd_compress_literals.h | 25 +- - lib/zstd/compress/zstd_compress_sequences.c | 7 +- - lib/zstd/compress/zstd_compress_sequences.h | 3 +- - lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_sequences.c | 21 +- + lib/zstd/compress/zstd_compress_sequences.h | 16 +- + lib/zstd/compress/zstd_compress_superblock.c | 394 +- lib/zstd/compress/zstd_compress_superblock.h | 3 +- - lib/zstd/compress/zstd_cwksp.h | 169 +- - lib/zstd/compress/zstd_double_fast.c | 143 +- - lib/zstd/compress/zstd_double_fast.h | 17 +- - lib/zstd/compress/zstd_fast.c | 596 +++-- - lib/zstd/compress/zstd_fast.h | 6 +- - lib/zstd/compress/zstd_lazy.c | 732 +++--- - lib/zstd/compress/zstd_lazy.h | 138 +- - lib/zstd/compress/zstd_ldm.c | 21 +- - lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 222 +- + lib/zstd/compress/zstd_double_fast.c | 245 +- + lib/zstd/compress/zstd_double_fast.h | 27 +- + lib/zstd/compress/zstd_fast.c | 703 +++- + lib/zstd/compress/zstd_fast.h | 16 +- + lib/zstd/compress/zstd_lazy.c | 840 +++-- + lib/zstd/compress/zstd_lazy.h | 195 +- + lib/zstd/compress/zstd_ldm.c | 102 +- + lib/zstd/compress/zstd_ldm.h | 17 +- lib/zstd/compress/zstd_ldm_geartab.h | 3 +- - lib/zstd/compress/zstd_opt.c | 497 ++-- - lib/zstd/compress/zstd_opt.h | 41 +- - lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/compress/zstd_opt.c | 571 +-- + lib/zstd/compress/zstd_opt.h | 55 +- + lib/zstd/compress/zstd_preSplit.c | 239 ++ + lib/zstd/compress/zstd_preSplit.h | 34 + + lib/zstd/decompress/huf_decompress.c | 887 +++-- lib/zstd/decompress/zstd_ddict.c | 9 +- lib/zstd/decompress/zstd_ddict.h | 3 +- - lib/zstd/decompress/zstd_decompress.c | 356 ++- - lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress.c | 375 +- + lib/zstd/decompress/zstd_decompress_block.c | 724 ++-- lib/zstd/decompress/zstd_decompress_block.h | 10 +- - .../decompress/zstd_decompress_internal.h | 9 +- + .../decompress/zstd_decompress_internal.h | 19 +- lib/zstd/decompress_sources.h | 2 +- lib/zstd/zstd_common_module.c | 5 +- - lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_compress_module.c | 75 +- lib/zstd/zstd_decompress_module.c | 4 +- - 58 files changed, 6576 insertions(+), 3530 deletions(-) + 60 files changed, 8746 insertions(+), 4379 deletions(-) create mode 100644 lib/zstd/common/allocations.h create mode 100644 lib/zstd/common/bits.h + create mode 100644 lib/zstd/compress/zstd_preSplit.c + create mode 100644 lib/zstd/compress/zstd_preSplit.h --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -77,6 +81,137 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All 
rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -160,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; + zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + ++typedef ZSTD_CCtx zstd_cctx; ++typedef ZSTD_cParameter zstd_cparameter; ++ ++/** ++ * zstd_cctx_set_param() - sets a compression parameter ++ * @cctx: The context. Must have been initialized with zstd_init_cctx(). ++ * @param: The parameter to set. ++ * @value: The value to set the parameter to. ++ * ++ * Return: Zero or an error, which can be checked using zstd_is_error(). ++ */ ++size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); ++ + + /** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level +@@ -175,8 +188,6 @@ zstd_compression_parameters zstd_get_cpa + + /* ====== Single-pass Compression ====== */ + +-typedef ZSTD_CCtx zstd_cctx; +- + /** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx + * @parameters: The compression parameters to be used. +@@ -191,6 +202,20 @@ typedef ZSTD_CCtx zstd_cctx; + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + + /** ++ * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to ++ * initialize a zstd_cctx when using the block-level external sequence ++ * producer API. ++ * @parameters: The compression parameters to be used. ++ * ++ * If multiple compression parameters might be used, the caller must call ++ * this function for each set of parameters and use the maximum size. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cctx(). ++ */ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); ++ ++/** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. +@@ -425,6 +450,16 @@ typedef ZSTD_CStream zstd_cstream; + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + + /** ++ * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize ++ * a zstd_cstream when using the block-level external sequence producer API. ++ * @cparams: The compression parameters to be used for compression. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cstream(). ++ */ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); ++ ++/** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. + * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller +@@ -584,6 +619,18 @@ size_t zstd_decompress_stream(zstd_dstre + size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + + /** ++ * zstd_register_sequence_producer() - exposes the zstd library function ++ * ZSTD_registerSequenceProducer(). This is used for the block-level external ++ * sequence producer API. See upstream zstd.h for detailed documentation. 
++ */ ++typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++); ++ ++/** + * struct zstd_frame_params - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not + * present. +@@ -596,7 +643,7 @@ size_t zstd_find_frame_compressed_size(c + * + * See zstd_lib.h. + */ +-typedef ZSTD_frameHeader zstd_frame_header; ++typedef ZSTD_FrameHeader zstd_frame_header; + + /** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame +@@ -611,4 +658,35 @@ typedef ZSTD_frameHeader zstd_frame_head + size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); + ++/** ++ * struct zstd_sequence - a sequence of literals or a match ++ * ++ * @offset: The offset of the match ++ * @litLength: The literal length of the sequence ++ * @matchLength: The match length of the sequence ++ * @rep: Represents which repeat offset is used ++ */ ++typedef ZSTD_Sequence zstd_sequence; ++ ++/** ++ * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals ++ * ++ * @cctx: The zstd compression context. ++ * @dst: The buffer to compress the data into. ++ * @dst_capacity: The size of the destination buffer. ++ * @in_seqs: The array of zstd_sequence to compress. ++ * @in_seqs_size: The number of sequences in in_seqs. ++ * @literals: The literals associated to the sequences to be compressed. ++ * @lit_size: The size of the literals in the literals buffer. ++ * @lit_capacity: The size of the literals buffer. ++ * @decompressed_size: The size of the input data ++ * ++ * Return: The compressed size or an error, which can be checked using ++ * zstd_is_error(). ++ */ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size); ++ + #endif /* LINUX_ZSTD_H */ --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ @@ -87,14 +222,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -17,8 +18,17 @@ +@@ -12,13 +13,18 @@ + #define ZSTD_ERRORS_H_398273423 - /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ --#define ZSTDERRORLIB_VISIBILITY --#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +-/*===== dependency =====*/ +-#include <linux/types.h> /* size_t */ ++/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#define ZSTDERRORLIB_VISIBLE -+ + +#ifndef ZSTDERRORLIB_HIDDEN +# if (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) @@ -102,12 +238,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +# define ZSTDERRORLIB_HIDDEN +# endif +#endif -+ + +-/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE /*-********************************************* * Error codes list -@@ -43,14 +53,17 @@ typedef enum { +@@ -43,14 +49,18 @@ typedef enum { ZSTD_error_frameParameter_windowTooLarge = 16, ZSTD_error_corruption_detected = 20, ZSTD_error_checksum_wrong = 22, @@ -121,11 +260,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_error_tableLog_tooLarge = 44, ZSTD_error_maxSymbolValue_tooLarge = 46, ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_cannotProduce_uncompressedBlock = 49, + ZSTD_error_stabilityCondition_notRespected = 50, ZSTD_error_stage_wrong = 60, ZSTD_error_init_missing = 62, ZSTD_error_memory_allocation = 64, -@@ -58,11 +71,15 @@ typedef enum { +@@ -58,18 +68,18 @@ typedef enum { ZSTD_error_dstSize_tooSmall = 70, ZSTD_error_srcSize_wrong = 72, ZSTD_error_dstBuffer_null = 74, @@ -141,6 +281,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ } ZSTD_ErrorCode; +-/*! ZSTD_getErrorCode() : +- convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, +- which can be used to compare with enum list published above */ +-ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); + ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -151,15 +298,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -11,23 +12,42 @@ +@@ -11,23 +12,47 @@ #ifndef ZSTD_H_235446 #define ZSTD_H_235446 -/* ====== Dependency ======*/ +-#include <linux/limits.h> /* INT_MAX */ ++ +/* ====== Dependencies ======*/ - #include <linux/limits.h> /* INT_MAX */ #include <linux/types.h> /* size_t */ ++#include <linux/zstd_errors.h> /* list of errors */ ++#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) ++#include <linux/limits.h> /* INT_MAX */ ++#endif /* ZSTD_STATIC_LINKING_ONLY */ ++ /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#ifndef ZSTDLIB_VISIBLE @@ -185,7 +338,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ +#else -+# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) @@ -198,16 +351,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ***************************************************************************** Introduction -@@ -65,7 +85,7 @@ +@@ -65,7 +90,7 @@ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 2 -+#define ZSTD_VERSION_RELEASE 6 ++#define ZSTD_VERSION_RELEASE 7 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : -@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionStri +@@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionStri + + + /* ************************************* +-* Simple API ++* Simple Core API ***************************************/ /*! ZSTD_compress() : * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. @@ -217,30 +375,122 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, -@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getF +@@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* + int compressionLevel); + + /*! ZSTD_decompress() : +- * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. +- * `dstCapacity` is an upper bound of originalSize to regenerate. +- * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. +- * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), +- * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ++ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. ++ * Multiple compressed frames can be decompressed at once with this method. ++ * The result will be the concatenation of all decompressed frames, back to back. ++ * `dstCapacity` is an upper bound of originalSize to regenerate. ++ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). ++ * If maximum upper bound isn't known, prefer using streaming mode to decompress data. 
++ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), ++ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + ++ ++/*====== Decompression helper functions ======*/ ++ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ +- * `src` should point to the start of a ZSTD encoded frame. +- * `srcSize` must be at least as large as the frame header. +- * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. +- * @return : - decompressed size of `src` frame content, if known +- * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +- * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) +- * note 1 : a 0 return value means the frame is valid but "empty". +- * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. +- * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. +- * In which case, it's necessary to use streaming mode to decompress data. +- * Optionally, application can rely on some implicit limit, +- * as ZSTD_decompress() only needs an upper bound of decompressed size. +- * (For example, data could be necessarily cut into blocks <= 16 KB). +- * note 3 : decompressed size is always present when compression is completed using single-pass functions, +- * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). +- * note 4 : decompressed size can be very large (64-bits value), +- * potentially larger than what local system can handle as a single memory segment. +- * In which case, it's necessary to use streaming mode to decompress data. +- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. +- * Always ensure return value fits within application's authorized limits. +- * Each application can set its own limits. +- * note 6 : This function replaces ZSTD_getDecompressedSize() */ ++ * `src` should point to the start of a ZSTD encoded frame. ++ * `srcSize` must be at least as large as the frame header. ++ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. ++ * @return : - decompressed size of `src` frame content, if known ++ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined ++ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) ++ * note 1 : a 0 return value means the frame is valid but "empty". ++ * When invoking this method on a skippable frame, it will return 0. ++ * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). ++ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * Optionally, application can rely on some implicit limit, ++ * as ZSTD_decompress() only needs an upper bound of decompressed size. ++ * (For example, data could be necessarily cut into blocks <= 16 KB). ++ * note 3 : decompressed size is always present when compression is completed using single-pass functions, ++ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). ++ * note 4 : decompressed size can be very large (64-bits value), ++ * potentially larger than what local system can handle as a single memory segment. 
++ * In which case, it's necessary to use streaming mode to decompress data. ++ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. ++ * Always ensure return value fits within application's authorized limits. ++ * Each application can set its own limits. ++ * note 6 : This function replaces ZSTD_getDecompressedSize() */ + #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) + #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +-/*! ZSTD_getDecompressedSize() : +- * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). ++/*! ZSTD_getDecompressedSize() (obsolete): ++ * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ --ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); +ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") -+ZSTDLIB_API -+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ - * `src` should point to the start of a ZSTD frame or skippable frame. -@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompres +@@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getD + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, +- * or an error code if input is invalid */ ++ * or an error code if input is invalid ++ * Note 1: this method is called _find*() because it's not enough to read the header, ++ * it may have to scan through the frame's content, to reach its end. ++ * Note 2: this method also works with Skippable Frames. In which case, ++ * it returns the size of the complete skippable frame, ++ * which is always equal to its content size + 8 bytes for headers. */ + ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); - /*====== Helper functions ======*/ +-/*====== Helper functions ======*/ -#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ -ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ -+/* ZSTD_compressBound() : +-ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +-ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +-ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +-ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ ++/*====== Compression helper functions ======*/ ++ ++/*! 
ZSTD_compressBound() : + * maximum compressed size in worst case single-pass scenario. -+ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * When invoking `ZSTD_compress()`, or any other one-pass compression function, + * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) + * as it eliminates one potential failure scenario, + * aka not enough room in dst buffer to write the compressed frame. -+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . + * In which case, ZSTD_compressBound() will return an error code + * which can be tested using ZSTD_isError(). + * @@ -248,29 +498,49 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * same as ZSTD_compressBound(), but as a macro. + * It can be used to produce constants, which can be useful for static allocation, + * for example to size a static array on stack. -+ * Will produce constant value 0 if srcSize too large. ++ * Will produce constant value 0 if srcSize is too large. + */ +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) +#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++ ++ ++/*====== Error helper functions ======*/ +/* ZSTD_isError() : + * Most ZSTD_* functions returning a size_t value can be tested for error, + * using ZSTD_isError(). + * @return 1 if error, 0 otherwise + */ - ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ - ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ - ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ -@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLev ++ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ ++ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ ++ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ ++ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ++ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ ++ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + + /* ************************************* +@@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLev + ***************************************/ /*= Compression context * When compressing many times, - * it is recommended to allocate a context just once, +- * it is recommended to allocate a context just once, - * and re-use it for each successive compression operation. +- * This will make workload friendlier for system's memory. ++ * it is recommended to allocate a compression context just once, + * and reuse it for each successive compression operation. 
- * This will make workload friendlier for system's memory. ++ * This will make the workload easier for system's memory. * Note : re-using context is just a speed / resource optimization. * It doesn't change the compression ratio, which remains identical. -@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZST +- * Note 2 : In multi-threaded environments, +- * use one different context per thread for parallel execution. ++ * Note 2: For parallel execution in multi-threaded environments, ++ * use one different context per thread . + */ + typedef struct ZSTD_CCtx_s ZSTD_CCtx; + ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ ++ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ /*! ZSTD_compressCCtx() : * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. @@ -281,9 +551,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * this function compresses at the requested compression level, + * __ignoring any other advanced parameter__ . * If any advanced parameter was set using the advanced API, - * they will all be reset. Only `compressionLevel` remains. +- * they will all be reset. Only `compressionLevel` remains. ++ * they will all be reset. Only @compressionLevel remains. */ -@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZST + ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZST /*= Decompression context * When decompressing many times, * it is recommended to allocate a context only once, @@ -292,7 +565,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * This will make workload friendlier for system's memory. * Use one context per thread for parallel execution. */ typedef struct ZSTD_DCtx_s ZSTD_DCtx; -@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZST +@@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZST /*! ZSTD_decompressDCtx() : * Same as ZSTD_decompress(), * requires an allocated ZSTD_DCtx. @@ -301,7 +574,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, -@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(Z +@@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(Z * using ZSTD_CCtx_set*() functions. * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! @@ -316,13 +589,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ -@@ -324,6 +369,19 @@ typedef enum { +@@ -324,6 +390,19 @@ typedef enum { * The higher the value of selected strategy, the more complex it is, * resulting in stronger and slower compression. * Special: value 0 means "use default strategy". */ + + ZSTD_c_targetCBlockSize=130, /* v1.5.6+ -+ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Attempts to fit compressed block size into approximately targetCBlockSize. + * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. + * Note that it's not a guarantee, just a convergence target (default:0). + * No target when targetCBlockSize == 0. @@ -336,7 +609,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. 
* This parameter is designed to improve compression ratio -@@ -403,7 +461,6 @@ typedef enum { +@@ -403,15 +482,18 @@ typedef enum { * ZSTD_c_forceMaxWindow * ZSTD_c_forceAttachDict * ZSTD_c_literalCompressionMode @@ -344,9 +617,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * ZSTD_c_srcSizeHint * ZSTD_c_enableDedicatedDictSearch * ZSTD_c_stableInBuffer -@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences - * ZSTD_c_useBlockSplitter +- * ZSTD_c_useBlockSplitter ++ * ZSTD_c_blockSplitterLevel ++ * ZSTD_c_splitAfterSequences * ZSTD_c_useRowMatchFinder + * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableSeqProducerFallback @@ -354,7 +630,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. -@@ -421,7 +481,7 @@ typedef enum { +@@ -421,7 +503,7 @@ typedef enum { ZSTD_c_experimentalParam3=1000, ZSTD_c_experimentalParam4=1001, ZSTD_c_experimentalParam5=1002, @@ -363,7 +639,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_c_experimentalParam7=1004, ZSTD_c_experimentalParam8=1005, ZSTD_c_experimentalParam9=1006, -@@ -430,7 +490,11 @@ typedef enum { +@@ -430,7 +512,12 @@ typedef enum { ZSTD_c_experimentalParam12=1009, ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, @@ -372,11 +648,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014, + ZSTD_c_experimentalParam18=1015, -+ ZSTD_c_experimentalParam19=1016 ++ ZSTD_c_experimentalParam19=1016, ++ ZSTD_c_experimentalParam20=1017 } ZSTD_cParameter; typedef struct { -@@ -493,7 +557,7 @@ typedef enum { +@@ -493,7 +580,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". @@ -385,7 +662,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. -@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_ +@@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_ /*! ZSTD_compress2() : * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. @@ -400,7 +677,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ -@@ -543,13 +609,17 @@ typedef enum { +@@ -543,13 +632,17 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts @@ -419,7 +696,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } ZSTD_dParameter; -@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { +@@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { * A ZSTD_CStream object is required to track streaming operation. * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. 
@@ -436,7 +713,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * When in doubt, it's recommended to fully initialize the context before usage. * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to -@@ -700,6 +770,11 @@ typedef enum { +@@ -700,6 +793,11 @@ typedef enum { * only ZSTD_e_end or ZSTD_e_flush operations are allowed. * Before starting a new compression job, or changing compression parameters, * it is required to fully flush internal buffers. @@ -448,7 +725,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, -@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v +@@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. @@ -457,7 +734,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ******************************************************************************/ /*! -@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v +@@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(v * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -467,16 +744,40 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! -@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_C +@@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_C * * A ZSTD_DStream object is required to track streaming operations. * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. -* ZSTD_DStream objects can be re-used multiple times. -+* ZSTD_DStream objects can be reused multiple times. ++* ZSTD_DStream objects can be re-employed multiple times. * * Use ZSTD_initDStream() to start a new decompression operation. * @return : recommended first input size -@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD +@@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_C + * The function will update both `pos` fields. + * If `input.pos < input.size`, some input has not been consumed. + * It's up to the caller to present again remaining data. ++* + * The function tries to flush all data decoded immediately, respecting output buffer size. + * If `output.pos < output.size`, decoder has flushed everything it could. +-* But if `output.pos == output.size`, there might be some data left within internal buffers., ++* ++* However, when `output.pos == output.size`, it's more difficult to know. ++* If @return > 0, the frame is not complete, meaning ++* either there is still some data left to flush within internal buffers, ++* or there is more input to read to complete the frame (or both). + * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. + * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. 
+ * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : + * the return value is a suggested next input size (just a hint for better latency) +-* that will never request more than the remaining frame size. ++* that will never request more than the remaining content of the compressed frame. + * *******************************************************************************/ + + typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ +@@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD /*===== Streaming decompression functions =====*/ @@ -497,9 +798,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * Function will update both input and output `pos` fields exposing current state via these fields: + * - `input.pos < input.size`, some input remaining and caller should provide remaining input + * on the next call. -+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. -+ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, -+ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * - `output.pos < output.size`, decoder flushed internal output buffer. ++ * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, ++ * check ZSTD_decompressStream() @return value, ++ * if > 0, invoke it again to flush remaining data to output. + * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. + * + * @return : 0 when a frame is completely decoded and fully flushed, @@ -515,7 +817,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ -@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from +@@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). @@ -524,7 +826,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. -@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from +@@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), @@ -539,7 +841,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ******************************************************************************/ -@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from +@@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". 
@@ -551,7 +853,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, -@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from +@@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_from * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. * In such a case, dictionary buffer must outlive its users. * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() @@ -569,7 +871,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. -@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZS +@@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZS * Decompression will need same prefix to properly regenerate data. * Compressing with a prefix is similar in outcome as performing a diff and compressing it, * but performs much faster, especially during decompression (compression speed is tunable with compression level). @@ -577,7 +879,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary * Note 1 : Prefix buffer is referenced. It **must** outlive compression. -@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(Z +@@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(Z const void* prefix, size_t prefixSize); /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ @@ -590,7 +892,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". -@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDiction +@@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDiction * The memory for the table is allocated on the first call to refDDict, and can be * freed with ZSTD_freeDCtx(). * @@ -603,7 +905,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ -@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con +@@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(c + ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); + ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + ++ + #endif /* ZSTD_H_235446 */ + + +@@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con + #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) + #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + ++ + /* This can be overridden externally to hide static symbols. 
*/ + #ifndef ZSTDLIB_STATIC_API #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE #endif @@ -628,7 +944,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** -@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con +@@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 @@ -636,7 +952,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define ZSTD_OVERLAPLOG_MIN 0 -@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con +@@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(con #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) /* Advanced parameter bounds */ @@ -645,8 +961,32 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX -@@ -1303,7 +1395,7 @@ typedef enum { - } ZSTD_paramSwitch_e; +@@ -1188,7 +1311,7 @@ typedef struct { + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external +- * sequence provider's perspective. For example, ZSTD_compressSequences() does not ++ * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ + } ZSTD_Sequence; +@@ -1293,17 +1416,18 @@ typedef enum { + } ZSTD_literalCompressionMode_e; + + typedef enum { +- /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final +- * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable +- * or ZSTD_ps_disable allow for a force enable/disable the feature. ++ /* Note: This enum controls features which are conditionally beneficial. ++ * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), ++ * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +-} ZSTD_paramSwitch_e; ++} ZSTD_ParamSwitch_e; ++#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ /* ************************************* -* Frame size functions @@ -654,33 +994,41 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ***************************************/ /*! ZSTD_findDecompressedSize() : -@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZS +@@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZS + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + + /*! ZSTD_frameHeaderSize() : +- * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. ++ * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. 
+ * @return : size of the Frame Header, * or an error code (if srcSize is too small) */ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); -+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; ++#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; -+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; -+ unsigned dictID; ++ unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ + unsigned checksumFlag; + unsigned _reserved1; + unsigned _reserved2; -+} ZSTD_frameHeader; ++} ZSTD_FrameHeader; ++#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ + +/*! ZSTD_getFrameHeader() : -+ * decode Frame Header, or requires larger `srcSize`. -+ * @return : 0, `zfhPtr` is correctly filled, -+ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. ++ * @return : 0 => header is complete, `zfhPtr` is correctly filled, ++ * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, + * or an error code, which can be tested using ZSTD_isError() */ -+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. @@ -728,10 +1076,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + )) + typedef enum { - ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ - ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ - } ZSTD_sequenceFormat_e; - +- ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ +- ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +-} ZSTD_sequenceFormat_e; ++ ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ ++ ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ ++} ZSTD_SequenceFormat_e; ++#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ ++ +/*! 
ZSTD_sequenceBound() : + * `srcSize` : size of the input buffer + * @return : upper-bound for the number of sequences that can be generated @@ -740,7 +1092,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). + */ +ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); -+ + /*! ZSTD_generateSequences() : - * Generate sequences using ZSTD_compress2, given a source buffer. + * WARNING: This function is meant for debugging and informational purposes ONLY! @@ -755,7 +1107,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * @param zc The compression context to be used for ZSTD_compress2(). Set any + * compression parameters you need on this context. + * @param outSeqs The output sequences buffer of size @p outSeqsSize -+ * @param outSeqsSize The size of the output sequences buffer. ++ * @param outSeqsCapacity The size of the output sequences buffer. + * ZSTD_sequenceBound(srcSize) is an upper bound on the number + * of sequences that can be generated. + * @param src The source buffer to generate sequences from of size @p srcSize. @@ -783,40 +1135,146 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") +ZSTDLIB_STATIC_API size_t +ZSTD_generateSequences(ZSTD_CCtx* zc, -+ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize); /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals -@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateS +@@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateS ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : - * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. +- * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. + * @src contains the entire input (not just the literals). + * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals - * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) ++ * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). * The entire source is compressed into a single frame. * -@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBloc - * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. - * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, - * and cannot emit an RLE block that disagrees with the repcode history + * The compression behavior changes based on cctx params. In particular: +@@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBloc + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain +- * block delimiters (defined in ZSTD_Sequence). 
Behavior is undefined if no block delimiters are provided. ++ * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * +- * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined +- * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for +- * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. ++ * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes ++ * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit ++ * can vary greatly depending on Sequences. On the other hand, repcode resolution is an expensive operation. ++ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). ++ * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. ++ * ++ * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined ++ * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for ++ * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. +@@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBloc + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * +- * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. +- * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, +- * and cannot emit an RLE block that disagrees with the repcode history - * @return : final compressed size or a ZSTD error. -+ * @return : final compressed size, or a ZSTD error code. - */ +- */ -ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize); ++ * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. ++ * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, ++ * and cannot emit an RLE block that disagrees with the repcode history. ++ * @return : final compressed size, or a ZSTD error code. ++ */ +ZSTDLIB_STATIC_API size_t -+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, -+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, -+ const void* src, size_t srcSize); ++ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); ++ ++ ++/*! 
ZSTD_compressSequencesAndLiterals() : ++ * This is a variant of ZSTD_compressSequences() which, ++ * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), ++ * aka all the literals, already extracted and laid out into a single continuous buffer. ++ * This can be useful if the process generating the sequences also happens to generate the buffer of literals, ++ * thus skipping an extraction + caching stage. ++ * It's a speed optimization, useful when the right conditions are met, ++ * but it also features the following limitations: ++ * - Only supports explicit delimiter mode ++ * - Currently does not support Sequences validation (so input Sequences are trusted) ++ * - Not compatible with frame checksum, which must be disabled ++ * - If any block is incompressible, will fail and return an error ++ * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. ++ * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. ++ * @litBufCapacity must be at least 8 bytes larger than @litSize. ++ * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t litBufCapacity, ++ size_t decompressedSize); /*! ZSTD_writeSkippableFrame() : -@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFra +@@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressS + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. +- * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so +- * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ++ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, ++ * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). +@@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressS + * @return : number of bytes written or a ZSTD error. + */ + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, unsigned magicVariant); ++ const void* src, size_t srcSize, ++ unsigned magicVariant); + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * +- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, +- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested +- * in the magicVariant. ++ * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, ++ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. 
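Taken together, the sequence-level entry points above are typically wired up as in the sketch below. This is a hedged illustration of the experimental API as declared in this header revision (requires ZSTD_STATIC_LINKING_ONLY; signatures have shifted across zstd releases), and it assumes the caller already owns a ZSTD_Sequence array carrying explicit block delimiters:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Re-emit a frame from externally supplied sequences. Returns the compressed
 * size, or a zstd error code (test with ZSTD_isError()). */
size_t compress_with_sequences(ZSTD_CCtx *cctx,
                               void *dst, size_t dstCapacity,
                               const ZSTD_Sequence *seqs, size_t nbSeqs,
                               const void *src, size_t srcSize)
{
    /* The sequence array is assumed to carry explicit block delimiters;
     * enabling validation makes malformed input fail cleanly with an error
     * instead of producing a corrupted block. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                           ZSTD_sf_explicitBlockDelimiters);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);
    return ZSTD_compressSequences(cctx, dst, dstCapacity,
                                  seqs, nbSeqs, src, srcSize);
}

Note that `src` must still contain the entire original input, not just the literals; the AndLiterals variant above is the specialization for callers that already hold the literals in a separate contiguous buffer.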
++ * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, ++ const void* src, size_t srcSize); + + /*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +-ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); ++ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +@@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFra /*! ZSTD_estimate*() : * These functions make it possible to estimate memory usage * of a future {D,C}Ctx, before its creation. @@ -891,7 +1349,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); /*! ZSTD_estimate?DictSize() : -@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCPar +@@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void + typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; + static + __attribute__((__unused__)) ++ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" ++#endif + ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic pop ++#endif + + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +@@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCPar * This function never fails (wide contract) */ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); @@ -941,7 +1415,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, -@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP +@@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * See the comments on that enum for an explanation of the feature. */ + #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +-/* Controlled with ZSTD_paramSwitch_e enum. ++/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals +@@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 @@ -953,7 +1436,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* User's best guess of source size. * Hint is not valid when srcSizeHint == 0. 
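Referring back to the skippable-frame helpers above, they round-trip as in this standalone sketch (static linking only in this header revision; magic variant 0 and the tiny payload are arbitrary choices for illustration):

#define ZSTD_STATIC_LINKING_ONLY
#include <stdio.h>
#include <zstd.h>

int main(void)
{
    const char meta[] = "build=1234";   /* out-of-band payload a decoder may skip */
    char frame[64];
    char back[sizeof meta];
    unsigned variant = 99;              /* will be overwritten on read */

    size_t const fsize = ZSTD_writeSkippableFrame(frame, sizeof frame,
                                                  meta, sizeof meta, 0);
    if (ZSTD_isError(fsize)) return 1;

    /* A conforming decoder simply skips this frame; the producer can still
     * recover the payload and the magic variant it was written with. */
    if (!ZSTD_isSkippableFrame(frame, fsize)) return 1;
    size_t const rsize = ZSTD_readSkippableFrame(back, sizeof back,
                                                 &variant, frame, fsize);
    if (ZSTD_isError(rsize) || rsize != sizeof meta || variant != 0) return 1;

    printf("skippable frame: %zu bytes, variant %u, payload \"%s\"\n",
           fsize, variant, back);
    return 0;
}

The 16 reserved magic variants make it possible to multiplex several kinds of application metadata inside one compressed stream without confusing ordinary decoders.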
* There is no guarantee that hint is close to actual source size, -@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP +@@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP * Experimental parameter. * Default is 0 == disabled. Set to 1 to enable. * @@ -977,7 +1460,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * When this flag is enabled zstd won't allocate an input window buffer, * because the user guarantees it can reference the ZSTD_inBuffer until -@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP +@@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also * avoid the memcpy() from the input buffer to the input window buffer. * @@ -1001,21 +1484,80 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 -@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP - * Without validation, providing a sequence that does not conform to the zstd spec will cause - * undefined behavior, and may produce a corrupted block. +@@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + /* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * +- * For use with sequence compression API: ZSTD_compressSequences(). +- * Designates whether or not we validate sequences provided to ZSTD_compressSequences() ++ * For use with sequence compression API: ZSTD_compressSequences*(). ++ * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * +- * Without validation, providing a sequence that does not conform to the zstd spec will cause +- * undefined behavior, and may produce a corrupted block. ++ * When Sequence validation is disabled (default), Sequences are compressed as-is, ++ * so they must correct, otherwise it would result in a corruption error. * - * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for -+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for ++ * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. ++ * If a Sequence is detected invalid (see doc/zstd_compression_format.md for * specifics regarding offset/matchlength requirements) then the function will bail out and * return an error. - * -@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP +- * + */ + #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +-/* ZSTD_c_useBlockSplitter +- * Controlled with ZSTD_paramSwitch_e enum. ++/* ZSTD_c_blockSplitterLevel ++ * note: this parameter only influences the first splitter stage, ++ * which is active before producing the sequences. ++ * ZSTD_c_splitAfterSequences controls the next splitter stage, ++ * which is active after sequence production. ++ * Note that both can be combined. ++ * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. ++ * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. ++ * 1 means no splitting. ++ * Then, values from 2 to 6 are sorted in increasing cpu load order. ++ * ++ * Note that currently the first block is never split, ++ * to ensure expansion guarantees in presence of incompressible data. 
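Circling back to the two stable-buffer parameters above: together they enable a single-pass, copy-free style of streaming. The sketch below assumes the experimental parameter names from this header revision and omits per-call error checks on ZSTD_CCtx_setParameter():

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t compress_one_shot_stable(ZSTD_CCtx *cctx,
                                void *dst, size_t dstCapacity,
                                const void *src, size_t srcSize)
{
    ZSTD_inBuffer  in  = { src, srcSize, 0 };
    ZSTD_outBuffer out = { dst, dstCapacity, 0 };

    ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
    /* Both buffers are promised not to move for the whole session, so zstd
     * can skip its internal input-window and output-buffer copies. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableInBuffer, 1);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableOutBuffer, 1);

    /* With a stable input buffer the only sensible directive is ZSTD_e_end:
     * the entire input must be presented up front. */
    {
        size_t const r = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
        if (ZSTD_isError(r)) return r;
        /* r > 0 would mean dstCapacity was too small to finish the frame. */
        return out.pos;
    }
}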
++ */ ++#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 ++#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 ++ ++/* ZSTD_c_splitAfterSequences ++ * This is a stronger splitter algorithm, ++ * based on actual sequences previously produced by the selected parser. ++ * It's also slower, and as a consequence, mostly used for high compression levels. ++ * While the post-splitter does overlap with the pre-splitter, ++ * both can nonetheless be combined, ++ * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, ++ * resulting in higher compression ratio than just one of them. ++ * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. +@@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +-#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 ++#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + + /* ZSTD_c_useRowMatchFinder +- * Controlled with ZSTD_paramSwitch_e enum. ++ * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. +@@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refP */ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 +/* ZSTD_c_prefetchCDictTables -+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. + * + * In some situations, zstd uses CDict tables in-place rather than copying them + * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). @@ -1059,19 +1601,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make + * compressBound() inaccurate). Only currently meant to be used for testing. -+ * + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 + -+/* ZSTD_c_searchForExternalRepcodes -+ * This parameter affects how zstd parses external sequences, such as sequences -+ * provided through the compressSequences() API or from an external block-level -+ * sequence producer. ++/* ZSTD_c_repcodeResolution ++ * This parameter only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). + * -+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * This parameter affects how zstd parses external sequences, ++ * provided via the ZSTD_compressSequences*() API ++ * or from an external block-level sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets within + * external sequences, even if those repcodes are not explicitly indicated in + * the "rep" field. Note that this is the only way to exploit repcode matches -+ * while using compressSequences() or an external sequence producer, since zstd ++ * while using compressSequences*() or an external sequence producer, since zstd + * currently ignores the "rep" field of external sequences. + * + * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in @@ -1080,17 +1624,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * compression ratio. 
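As a usage sketch of the two splitter stages just described, the calls below combine them at full strength. These parameter names exist only in header revisions carrying this patch (zstd 1.5.7-era); on older libraries ZSTD_CCtx_setParameter() would reject them:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

void enable_max_block_splitting(ZSTD_CCtx *cctx)
{
    /* Pre-splitter at its most aggressive level (runs before sequences
     * are produced)... */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockSplitterLevel,
                           ZSTD_BLOCKSPLITTER_LEVEL_MAX);
    /* ...plus the sequence-based post-splitter; the comment above notes
     * that the two overlap but still compose for a better ratio. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_splitAfterSequences, ZSTD_ps_enable);
}

Both calls return a size_t error code; production code would check each with ZSTD_isError(), since either can fail on an out-of-range value or an older library.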
+ * + * The default value is ZSTD_ps_auto, for which the library will enable/disable -+ * based on compression level. -+ * -+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is -+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ * based on compression level (currently: level<10 disables, level>=10 enables). + */ -+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ ++ + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. -@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP +@@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP * in the range [dst, dst + pos) MUST not be modified during decompression * or you will get data corruption. * @@ -1099,7 +1642,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * it can write directly to the ZSTD_outBuffer, but it will still allocate * an input buffer large enough to fit any compressed block. This will also * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. -@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP +@@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 @@ -1133,7 +1676,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). -@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP +@@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getP * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") @@ -1141,7 +1684,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : -@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompres +@@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompres * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -1149,7 +1692,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); -@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CSt +@@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CSt * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -1170,7 +1713,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); * -@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_C +@@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_C * This prototype will generate compilation warnings. 
*/ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -1178,7 +1721,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, -@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CS +@@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CS * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -1197,7 +1740,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_refCDict(zcs, cdict); * -@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_ +@@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_ * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") @@ -1205,7 +1748,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, -@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advan +@@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advan * explicitly specified. * * start a new frame, using same parameters from previous frame. @@ -1214,7 +1757,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Note that zcs must be init at least once before using ZSTD_resetCStream(). * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. -@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advan +@@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advan * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") @@ -1222,7 +1765,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); -@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNo +@@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNo * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 @@ -1232,7 +1775,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! -@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre +@@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session @@ -1242,7 +1785,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /*! 
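The migration recipe spelled out in the deprecation comments above condenses into a small helper. This sketch uses only stable v1.4.0+ API; ZSTD_CStream is the same object as ZSTD_CCtx in modern zstd, so the context-level calls apply directly:

#include <zstd.h>

/* Modern replacement for ZSTD_initCStream_usingDict(). Returns 0 or a zstd
 * error code (test with ZSTD_isError()). */
size_t begin_stream_with_dict(ZSTD_CStream *zcs,
                              const void *dict, size_t dictSize,
                              unsigned long long pledgedSrcSize)
{
    size_t r;
    r = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
    if (ZSTD_isError(r)) return r;
    /* Pass ZSTD_CONTENTSIZE_UNKNOWN when the total size is not known. */
    r = ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
    if (ZSTD_isError(r)) return r;
    /* NULL/0 clears any previous dictionary, per the notes above. */
    return ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
}

Compression then proceeds with ZSTD_compressStream2(), using ZSTD_e_continue for intermediate calls and ZSTD_e_end to finish the frame, which is exactly the flow the deprecated initCStream family used to hide.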
-@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre +@@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStre * * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * @@ -1435,15 +1978,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ******************************************************************* -* Buffer-less and synchronous inner streaming functions +* Buffer-less and synchronous inner streaming functions (DEPRECATED) - * --* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. --* But it's also a complex one, with several restrictions, documented below. --* Prefer normal streaming API for an easier experience. ++* +* This API is deprecated, and will be removed in a future version. +* It allows streaming (de)compression with user allocated buffers. +* However, it is hard to use, and not as well tested as the rest of +* our API. -+* + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. +* Please use the normal streaming API instead: ZSTD_compressStream2, +* and ZSTD_decompressStream. +* If there is functionality that you need, but it doesn't provide, @@ -1451,7 +1994,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ********************************************************************* */ /* -@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr +@@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr A ZSTD_CCtx object is required to track streaming operations. Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. @@ -1464,7 +2007,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> Then, consume your input using ZSTD_compressContinue(). There are some important considerations to keep in mind when using this advanced function : -@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr +@@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStr It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. @@ -1515,8 +2058,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + >0 : `srcSize` is too small, please provide at least result bytes on next attempt. errorCode, which can be tested using ZSTD_isError(). - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, -@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_adv +- It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, ++ It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. +@@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_adv The most memory efficient way is to use a round buffer of sufficient size. 
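For readers untangling the buffer-less compression contract above, a skeleton of the loop follows. It is deliberately a sketch: the API is deprecated in this revision (compiling it produces deprecation warnings), the per-call output-capacity requirements are glossed over, and new code should use ZSTD_compressStream2() instead:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t bufferless_compress(ZSTD_CCtx *cctx,
                           void *dst, size_t dstCapacity,
                           const void *const chunks[], const size_t sizes[],
                           size_t nbChunks)
{
    char *op = dst;
    size_t r = ZSTD_compressBegin(cctx, 3 /* compression level */);
    if (ZSTD_isError(r)) return r;

    for (size_t i = 0; i + 1 < nbChunks; i++) {
        r = ZSTD_compressContinue(cctx, op,
                                  dstCapacity - (size_t)(op - (char*)dst),
                                  chunks[i], sizes[i]);
        if (ZSTD_isError(r)) return r;
        op += r;   /* each call may emit zero or more complete blocks */
    }
    /* The last chunk goes through compressEnd(), which writes the epilogue
     * (final block, plus checksum if enabled); srcSize==0 still emits the
     * mandatory last-block mark. */
    r = ZSTD_compressEnd(cctx, op, dstCapacity - (size_t)(op - (char*)dst),
                         nbChunks ? chunks[nbChunks - 1] : NULL,
                         nbChunks ? sizes[nbChunks - 1] : 0);
    if (ZSTD_isError(r)) return r;
    return (size_t)(op - (char*)dst) + r;
}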
Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), @@ -1525,7 +2072,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, up to the moment there is not enough room left in the buffer to guarantee decoding another full block, which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. -@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_adv +@@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_adv ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. @@ -1534,7 +2081,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. It can also be an error code, which can be tested with ZSTD_isError(). -@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_adv +@@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_adv */ /*===== Buffer-less streaming decompression functions =====*/ @@ -1562,7 +2109,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSi +@@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSi ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* misc */ @@ -1570,7 +2117,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e +@@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e @@ -1597,7 +2144,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> Block functions produce and decode raw zstd blocks, without frame metadata. Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. -@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e +@@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e - It is necessary to init context before starting + compression : any ZSTD_compressBegin*() variant, including with dictionary + decompression : any ZSTD_decompressBegin*() variant, including with dictionary @@ -1605,7 +2152,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + If input is larger than a block size, it's necessary to split input data into multiple blocks + For inputs larger than a single block, consider using regular ZSTD_compress() instead. 
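The matching decompression side is sketched below, again as a deprecated-API reading aid rather than a recommendation. The whole frame is assumed resident in memory, so the round-buffer sizing discussion above does not apply here, and the (size_t)-1 truncation return is a placeholder convention of this sketch only:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t bufferless_decompress(ZSTD_DCtx *dctx,
                             void *dst, size_t dstCapacity,
                             const void *src, size_t srcSize)
{
    const char *ip = src;
    char *op = dst;
    size_t r = ZSTD_decompressBegin(dctx);
    if (ZSTD_isError(r)) return r;

    for (;;) {
        size_t const need = ZSTD_nextSrcSizeToDecompress(dctx);
        if (need == 0) break;   /* frame fully decoded */
        if (need > srcSize - (size_t)(ip - (const char*)src))
            return (size_t)-1;  /* truncated input; illustrative convention */
        /* decompressContinue() requires exactly `need` bytes, no more. */
        r = ZSTD_decompressContinue(dctx, op,
                                    dstCapacity - (size_t)(op - (char*)dst),
                                    ip, need);
        if (ZSTD_isError(r)) return r;
        ip += need;
        op += r;   /* r == 0 for metadata items such as block headers */
    }
    return (size_t)(op - (char*)dst);
}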
-@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e +@@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e */ /*===== Raw zstd block functions =====*/ @@ -1618,9 +2165,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ -- - #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +- --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -1,6 +1,6 @@ @@ -1631,6 +2178,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> # All rights reserved. # # This source code is licensed under both the BSD-style license (found in the +@@ -26,6 +26,7 @@ zstd_compress-y := \ + compress/zstd_lazy.o \ + compress/zstd_ldm.o \ + compress/zstd_opt.o \ ++ compress/zstd_preSplit.o \ + + zstd_decompress-y := \ + zstd_decompress_module.o \ --- /dev/null +++ b/lib/zstd/common/allocations.h @@ -0,0 +1,56 @@ @@ -1692,7 +2247,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif /* ZSTD_ALLOCATIONS_H */ --- /dev/null +++ b/lib/zstd/common/bits.h -@@ -0,0 +1,149 @@ +@@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. @@ -1724,14 +2279,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) +{ + assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)__builtin_ctz(val); -+# else -+ return ZSTD_countTrailingZeros32_fallback(val); -+# endif ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++#else ++ return ZSTD_countTrailingZeros32_fallback(val); ++#endif +} + -+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) ++{ + assert(val != 0); + { + static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, @@ -1750,47 +2306,47 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) +{ + assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)__builtin_clz(val); -+# else -+ return ZSTD_countLeadingZeros32_fallback(val); -+# endif ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++#else ++ return ZSTD_countLeadingZeros32_fallback(val); ++#endif +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) +{ + assert(val != 0); -+# if (__GNUC__ >= 4) && defined(__LP64__) -+ return (unsigned)__builtin_ctzll(val); -+# else -+ { -+ U32 mostSignificantWord = (U32)(val >> 32); -+ U32 leastSignificantWord = (U32)val; -+ if (leastSignificantWord == 0) { -+ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); -+ } else { -+ return ZSTD_countTrailingZeros32(leastSignificantWord); -+ } ++#if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); + } -+# endif ++ } ++#endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) +{ + assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)(__builtin_clzll(val)); -+# 
else -+ { -+ U32 mostSignificantWord = (U32)(val >> 32); -+ U32 leastSignificantWord = (U32)val; -+ if (mostSignificantWord == 0) { -+ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); -+ } else { -+ return ZSTD_countLeadingZeros32(mostSignificantWord); -+ } ++#if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); + } -+# endif ++ } ++#endif +} + +MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) @@ -1854,19 +2410,51 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -27,6 +28,7 @@ +@@ -27,7 +28,7 @@ #include "compiler.h" /* UNLIKELY() */ #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ #include "error_private.h" /* error codes and messages */ +- +#include "bits.h" /* ZSTD_highbit32 */ - /*========================================= -@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C - /*-******************************************** + * Target specific +@@ -41,12 +42,13 @@ + /*-****************************************** + * bitStream encoding API (write forward) + ********************************************/ ++typedef size_t BitContainerType; + /* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; +@@ -54,7 +56,7 @@ typedef struct { + } BIT_CStream_t; + + MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +@@ -63,7 +65,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C + * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. + * + * bits are first added to a local register. +-* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. ++* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. + * Writing data into memory is an explicit operation, performed by the flushBits function. + * Hence keep track how many bits are potentially stored into local register to avoid register overflow. + * After a flushBits, a maximum of 7 bits might still be stored into local register. 
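The De Bruijn fallback in bits.h above deserves a note: multiplying the isolated lowest set bit by a De Bruijn constant turns the top five bits of the product into a perfect hash of the bit index, giving a branch-free count-trailing-zeros without compiler builtins. The standalone demo below uses the classic published table/constant pair for this technique (the same approach, though not copied from the kernel file) and checks it against the expected positions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned ctz32_debruijn(uint32_t val)
{
    static const unsigned table[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    };
    assert(val != 0);
    /* val & -val isolates the lowest set bit (always a power of two). */
    return table[((val & (0u - val)) * 0x077CB531u) >> 27];
}

int main(void)
{
    for (unsigned i = 0; i < 32; i++)
        assert(ctz32_debruijn(1u << i) == i);   /* exact powers of two */
    assert(ctz32_debruijn(0xFFF0u) == 4);       /* mixed value */
    printf("De Bruijn ctz fallback behaves like __builtin_ctz\n");
    return 0;
}

The 64-bit fallbacks above avoid a 64-entry table by splitting the value into two 32-bit words and recursing into the 32-bit helpers, which is cheaper on 32-bit targets.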
+@@ -80,28 +82,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C * bitStream decoding API (read backward) **********************************************/ -+typedef size_t BitContainerType; typedef struct { - size_t bitContainer; + BitContainerType bitContainer; @@ -1888,8 +2476,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } BIT_DStream_status; /* result of BIT_reloadDStream() */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); - MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); -@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(con +-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); ++MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); + MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); + MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + /* Start by invoking BIT_initDStream(). * A chunk of the bitStream is then stored into a local register. @@ -1898,7 +2489,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * You can then retrieve bitFields stored into the local register, **in reverse order**. * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. -@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CS +@@ -113,7 +115,7 @@ MEM_STATIC unsigned BIT_endOfDStream(con + /*-**************************************** + * unsafe API + ******************************************/ +-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + + MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CS MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); /* faster, but works only if nbBits >= 1 */ @@ -1932,11 +2532,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*===== Local Constants =====*/ static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, -@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CS +@@ -178,16 +153,22 @@ MEM_STATIC size_t BIT_initCStream(BIT_CS return 0; } -+FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) +{ + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; @@ -1945,7 +2545,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! BIT_addBits() : * can add up to 31 bits into `bitC`. * Note : does not check for register overflow ! 
*/ -@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_ + MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); assert(nbBits < BIT_MASK_SIZE); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); @@ -1954,7 +2557,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> bitC->bitPos += nbBits; } -@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS +@@ -195,7 +176,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_ + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ + MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +@@ -242,7 +223,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_C + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + } + + +@@ -266,35 +247,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); bitD->bitContainer = MEM_readLEST(bitD->ptr); { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; @@ -1998,22 +2619,30 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ } bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; -@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS +@@ -303,12 +284,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DS return srcSize; } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) -+FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) { return bitContainer >> start; } -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) { U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ -@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ +@@ -318,26 +299,20 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +-#if defined(__x86_64__) || defined(_M_X86) ++#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); + #else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; #endif } @@ -2030,11 +2659,19 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * On 64-bits, maxNbBits==56. 
* @return : value extracted */ -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) { /* arbitrate between double-shift and shift+mask */ #if 1 -@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const +@@ -353,14 +328,14 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT + + /*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) ++MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + { + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); } @@ -2043,25 +2680,29 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { bitD->bitsConsumed += nbBits; } -@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_sk +@@ -369,23 +344,38 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_sk * Read (consume) next n bits from local register and update. * Pay attention to not read more than nbBits contained into local register. * @return : extracted value. */ -MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) { - size_t const value = BIT_lookBits(bitD, nbBits); +- size_t const value = BIT_lookBits(bitD, nbBits); ++ BitContainerType const value = BIT_lookBits(bitD, nbBits); BIT_skipBits(bitD, nbBits); -@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_ + return value; } /*! BIT_readBitsFast() : - * unsafe version; only works only if nbBits >= 1 */ +-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + * unsafe version; only works if nbBits >= 1 */ - MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) ++MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { - size_t const value = BIT_lookBitsFast(bitD, nbBits); -@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_D +- size_t const value = BIT_lookBitsFast(bitD, nbBits); ++ BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); return value; } @@ -2083,7 +2724,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! BIT_reloadDStreamFast() : * Similar to BIT_reloadDStream(), but with two differences: * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! 
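The reverse-read convention documented for BIT_CStream_t/BIT_DStream_t is easy to misread, so here is an independent toy model in plain C (no zstd internals): values are appended going upward in a register, and the reader peels them off the top, so the last value written is the first one read. The real implementation additionally flushes and refills through memory in container-sized steps, which this toy skips:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t container = 0;
static unsigned wpos = 0;       /* bits written so far */
static unsigned consumed = 0;   /* bits consumed by the reader */

/* Forward writer: each value lands above the previous one, like BIT_addBits. */
static void add_bits(unsigned value, unsigned nbBits)
{
    container |= ((uint64_t)value & ((1ull << nbBits) - 1)) << wpos;
    wpos += nbBits;
}

/* Backward reader: consumes from the most recently written end. */
static unsigned read_bits(unsigned nbBits)
{
    consumed += nbBits;
    return (unsigned)((container >> (wpos - consumed))
                      & ((1ull << nbBits) - 1));
}

int main(void)
{
    add_bits(0x5, 3);     /* written first           */
    add_bits(0x2, 2);
    add_bits(0x1F, 5);    /* written last            */

    assert(read_bits(5) == 0x1F);   /* ...read first */
    assert(read_bits(2) == 0x2);
    assert(read_bits(3) == 0x5);    /* ...read last  */
    printf("LIFO bit order confirmed\n");
    return 0;
}

This stack-like ordering is what lets FSE encode symbols forward while the decoder walks the compressed stream backward from its end mark.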
-@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reload +@@ -396,31 +386,35 @@ MEM_STATIC BIT_DStream_status BIT_reload { if (UNLIKELY(bitD->ptr < bitD->limitPtr)) return BIT_DStream_overflow; @@ -2129,6 +2770,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { U32 nbBytes = bitD->bitsConsumed >> 3; BIT_DStream_status result = BIT_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { +@@ -442,5 +436,4 @@ MEM_STATIC unsigned BIT_endOfDStream(con + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); + } + +- + #endif /* BITSTREAM_H_MODULE */ --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -1,5 +1,6 @@ @@ -2227,7 +2874,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* vectorization * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, -@@ -126,9 +143,9 @@ +@@ -126,16 +143,13 @@ #define UNLIKELY(x) (__builtin_expect((x), 0)) #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) @@ -2239,7 +2886,41 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #endif /* disable warnings */ -@@ -179,6 +196,85 @@ + +-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +- +- + /* compile time determination of SIMD support */ + + /* C-language Attributes are added in C23. */ +@@ -158,9 +172,15 @@ + #define ZSTD_FALLTHROUGH fallthrough + + /*-************************************************************** +-* Alignment check ++* Alignment + *****************************************************************/ + ++/* @return 1 if @u is a 2^n value, 0 otherwise ++ * useful to check a value is valid for alignment restrictions */ ++MEM_STATIC int ZSTD_isPower2(size_t u) { ++ return (u & (u-1)) == 0; ++} ++ + /* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, +@@ -175,10 +195,95 @@ + + #endif /* ZSTD_ALIGNOF */ + ++#ifndef ZSTD_ALIGNED ++/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ ++#define ZSTD_ALIGNED(a) __attribute__((aligned(a))) ++#endif /* ZSTD_ALIGNED */ ++ ++ + /*-************************************************************** * Sanitizer *****************************************************************/ @@ -2262,7 +2943,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif + +/* -+ * Helper function to perform a wrapped pointer difference without trigging ++ * Helper function to perform a wrapped pointer difference without triggering + * UBSAN. + * + * @returns lhs - rhs with wrapping @@ -2370,7 +3051,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable +@@ -33,7 +34,6 @@ + #define DEBUG_H_12987983217 + + +- + /* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +@@ -82,20 +82,27 @@ extern int g_debuglevel; /* the variable It's useful when enabling very verbose levels on selective conditions (such as position in src) */ @@ -2408,7 +3097,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ #endif - +- +- + #endif /* DEBUG_H_12987983217 */ --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -1,6 +1,7 @@ @@ -2548,10 +3239,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; -@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum +@@ -38,17 +41,23 @@ const char* ERR_getErrorString(ERR_enum case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; + case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; case PREFIX(dictionary_wrong): return "Dictionary mismatch"; @@ -2581,7 +3273,24 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t c +@@ -13,8 +14,6 @@ + #ifndef ERROR_H_MODULE + #define ERROR_H_MODULE + +- +- + /* **************************************** + * Dependencies + ******************************************/ +@@ -23,7 +22,6 @@ + #include "debug.h" + #include "zstd_deps.h" /* size_t */ + +- + /* **************************************** + * Compiler-specific + ******************************************/ +@@ -49,8 +47,13 @@ ERR_STATIC unsigned ERR_isError(size_t c ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } /* check and forward error code */ @@ -2597,7 +3306,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*-**************************************** -@@ -84,10 +90,12 @@ void _force_has_format_string(const char +@@ -84,10 +87,12 @@ void _force_has_format_string(const char * We want to force this function invocation to be syntactically correct, but * we don't want to force runtime evaluation of its arguments. */ @@ -2614,7 +3323,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define ERR_QUOTE(str) #str -@@ -98,48 +106,50 @@ void _force_has_format_string(const char +@@ -98,48 +103,49 @@ void _force_has_format_string(const char * In order to do that (particularly, printing the conditional that failed), * this can't just wrap RETURN_ERROR(). */ @@ -2680,6 +3389,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - return err_code; \ - } \ - } while(0); +- +#define FORWARD_IF_ERROR(err, ...) \ + do { \ + size_t const err_code = (err); \ @@ -2693,7 +3403,6 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } \ + } while(0) - #endif /* ERROR_H_MODULE */ --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -2707,7 +3416,24 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -50,34 +51,6 @@ +@@ -11,8 +12,6 @@ + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ +- +- + #ifndef FSE_H + #define FSE_H + +@@ -22,7 +21,6 @@ + ******************************************/ + #include "zstd_deps.h" /* size_t, ptrdiff_t */ + +- + /*-***************************************** + * FSE_PUBLIC_API : control library symbols visibility + ******************************************/ +@@ -50,34 +48,6 @@ FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ @@ -2742,7 +3468,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*-***************************************** * Tool functions ******************************************/ -@@ -89,20 +62,6 @@ FSE_PUBLIC_API const char* FSE_getErrorN +@@ -89,20 +59,6 @@ FSE_PUBLIC_API const char* FSE_getErrorN /*-***************************************** @@ -2763,7 +3489,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * FSE detailed API ******************************************/ /*! -@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (v +@@ -161,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (v /*! Constructor and Destructor of FSE_CTable. Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ @@ -2772,7 +3498,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! FSE_buildCTable(): Builds `ct`, which must be already allocated, using FSE_createCTable(). -@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi +@@ -238,23 +192,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize, int bmi2); @@ -2796,15 +3522,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! 
Tutorial : -@@ -286,6 +227,7 @@ If there is an error, the function will +@@ -286,13 +224,11 @@ If there is an error, the function will #endif /* FSE_H */ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) #define FSE_H_FSE_STATIC_LINKING_ONLY +- +-/* *** Dependency *** */ + #include "bitstream.h" -@@ -317,16 +259,6 @@ If there is an error, the function will +- + /* ***************************************** + * Static allocation + *******************************************/ +@@ -317,16 +253,6 @@ If there is an error, the function will unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); /*< same as FSE_optimalTableLog(), which used `minus==2` */ @@ -2821,7 +3554,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ -@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* +@@ -344,19 +270,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ @@ -2844,19 +3577,19 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> typedef enum { FSE_repeat_none, /*< Cannot use the previous table */ -@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CSt +@@ -539,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CSt FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; const U16* const stateTable = (const U16*)(statePtr->stateTable); U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); - BIT_addBits(bitC, statePtr->value, nbBitsOut); -+ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) { - BIT_addBits(bitC, statePtr->value, statePtr->stateLog); -+ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); BIT_flushBits(bitC); } @@ -2868,6 +3601,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * note 1 : assume symbolValue is valid (<= maxSymbolValue) * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +@@ -705,7 +623,4 @@ MEM_STATIC unsigned FSE_endOfDState(cons + + #define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +- + #endif /* FSE_STATIC_LINKING_ONLY */ +- +- --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -1,6 +1,7 @@ @@ -3007,7 +3748,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic( void* dst, size_t maxDstSize, -@@ -287,32 +230,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr +@@ -248,6 +191,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr + FSE_initDState(&state1, &bitD, dt); + FSE_initDState(&state2, &bitD, dt); + ++ RETURN_ERROR_IF(BIT_reloadDStream(&bitD)==BIT_DStream_overflow, 
corruption_detected, ""); ++ + #define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD) + + /* 4 symbols per loop */ +@@ -287,32 +232,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr break; } } @@ -3042,7 +3792,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } FSE_DecompressWksp; -@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr +@@ -327,13 +252,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; @@ -3064,7 +3814,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (FSE_isError(NCountLength)) return NCountLength; if (tableLog > maxLog) return ERROR(tableLog_tooLarge); assert(NCountLength <= cSrcSize); -@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr +@@ -342,19 +272,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompr } if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); @@ -3090,7 +3840,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* ds +@@ -382,9 +313,4 @@ size_t FSE_decompress_wksp_bmi2(void* ds return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); } @@ -3112,7 +3862,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -18,99 +19,22 @@ +@@ -12,105 +13,26 @@ + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ + +- + #ifndef HUF_H_298734234 + #define HUF_H_298734234 /* *** Dependencies *** */ #include "zstd_deps.h" /* size_t */ @@ -3161,11 +3917,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - */ -HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, - const void* cSrc, size_t cSrcSize); +- +#include "mem.h" /* U32 */ +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" - /* *** Tool functions *** */ -#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ -HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ @@ -3219,7 +3975,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* *** Constants *** */ #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ -@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; +@@ -151,25 +73,49 @@ typedef U32 HUF_DTable; /* **************************************** * Advanced decompression functions ******************************************/ @@ -3282,7 +4038,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! HUF_compress() does the following: * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") -@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D +@@ -182,12 +128,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_D * For example, it's possible to compress several blocks using the same 'CTable', * or to save and regenerate 'CTable' using external methods. 
*/ @@ -3300,7 +4056,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); -@@ -196,6 +144,7 @@ typedef enum { +@@ -196,6 +142,7 @@ typedef enum { HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ } HUF_repeat; @@ -3308,7 +4064,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* HUF_compress4X_repeat() : * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. -@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, +@@ -206,13 +153,13 @@ size_t HUF_compress4X_repeat(void* dst, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ @@ -3324,7 +4080,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, -@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeig +@@ -238,7 +185,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeig U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workspace, size_t wkspSize, @@ -3333,7 +4089,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* HUF_readCTable() : * Loading a CTable saved with HUF_writeCTable() */ -@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, +@@ -246,9 +193,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, /* HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX @@ -3357,7 +4113,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* * HUF_decompress() does the following: * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics -@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, s +@@ -276,32 +236,12 @@ U32 HUF_selectDecoder (size_t dstSize, s #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) @@ -3391,13 +4147,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. 
-@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, +@@ -312,47 +252,27 @@ size_t HUF_compress1X_repeat(void* dst, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); -+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); - +- -size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ @@ -3409,17 +4164,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ -#endif -+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); - #ifndef HUF_FORCE_DECOMPRESS_X1 +-#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ -#endif -- ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif --#ifndef HUF_FORCE_DECOMPRESS_X1 ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ #endif @@ -3447,8 +4203,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #endif -#endif /* HUF_STATIC_LINKING_ONLY */ +- +#endif /* HUF_H_298734234 */ - --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ @@ -3486,7 +4242,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * This header is shared between C and ASM code, so it MUST only * contain macro definitions. It MUST not contain any C code. 
* -@@ -45,6 +46,8 @@ +@@ -45,30 +46,35 @@ /* Mark the internal assembly functions as hidden */ #ifdef __ELF__ # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func @@ -3495,16 +4251,42 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #else # define ZSTD_HIDE_ASM_FUNCTION(func) #endif -@@ -65,7 +68,7 @@ + ++/* Compile time determination of BMI2 support */ ++ ++ + /* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. + */ + #ifndef DYNAMIC_BMI2 +- #if ((defined(__clang__) && __has_attribute(__target__)) \ ++# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ +- && (defined(__x86_64__) || defined(_M_X64)) \ ++ && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +- # define DYNAMIC_BMI2 1 +- #else +- # define DYNAMIC_BMI2 0 +- #endif ++# define DYNAMIC_BMI2 1 ++# else ++# define DYNAMIC_BMI2 0 ++# endif #endif /* - * Only enable assembly for GNUC comptabile compilers, -+ * Only enable assembly for GNUC compatible compilers, ++ * Only enable assembly for GNU C compatible compilers, * because other platforms may not support GAS assembly syntax. * - * Only enable assembly for Linux / MacOS, other platforms may -@@ -90,4 +93,23 @@ +- * Only enable assembly for Linux / MacOS, other platforms may ++ * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * +@@ -90,4 +96,23 @@ */ #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 @@ -3622,7 +4404,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -28,7 +29,6 @@ +@@ -28,12 +29,10 @@ #include <linux/zstd.h> #define FSE_STATIC_LINKING_ONLY #include "fse.h" @@ -3630,7 +4412,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #include "huf.h" #include <linux/xxhash.h> /* XXH_reset, update, digest */ #define ZSTD_TRACE 0 -@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compre + +- + /* ---- static assert (debug) --- */ + #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) + #define ZSTD_isError ERR_isError /* for inlining */ +@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compre #define ZSTD_FRAMECHECKSUMSIZE 4 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ @@ -3639,10 +4426,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#define MIN_LITERALS_FOR_4_STREAMS 6 -#define HufLog 12 - typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; +-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; ++typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e; #define LONGNBSEQ 0x7F00 -@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_c + #define MINMATCH 3 #define Litbits 8 @@ -3650,7 +4438,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define MaxLit ((1<<Litbits) - 1) #define MaxML 52 #define MaxLL 35 -@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_c +@@ -103,6 +103,8 @@ typedef enum { set_basic, set_rle, set_c #define LLFSELog 9 #define OffFSELog 8 #define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog) @@ -3659,7 +4447,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */ /* Each table cannot take more than #symbols * FSELog bits */ -@@ -166,7 +169,7 @@ static void ZSTD_copy8(void* dst, const +@@ -166,7 +168,7 @@ static void ZSTD_copy8(void* dst, const ZSTD_memcpy(dst, src, 8); #endif } @@ -3668,7 +4456,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Need to use memmove here since the literal buffer can now be located within the dst buffer. In circumstances where the op "catches up" to where the -@@ -186,7 +189,7 @@ static void ZSTD_copy16(void* dst, const +@@ -186,7 +188,7 @@ static void ZSTD_copy16(void* dst, const ZSTD_memcpy(dst, copy16_buf, 16); #endif } @@ -3677,7 +4465,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #define WILDCOPY_OVERLENGTH 32 #define WILDCOPY_VECLEN 16 -@@ -215,7 +218,7 @@ void ZSTD_wildcopy(void* dst, const void +@@ -215,7 +217,7 @@ void ZSTD_wildcopy(void* dst, const void if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { /* Handle short offset copies. */ do { @@ -3686,7 +4474,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } while (op < oend); } else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); -@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void +@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void * one COPY16() in the first call. Then, do two calls per loop since * at that point it is more likely to have a high trip count. 
*/ @@ -3699,7 +4487,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_copy16(op, ip); if (16 >= length) return; op += 16; -@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void +@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void COPY16(op, ip); } while (op < oend); @@ -3707,48 +4495,70 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -289,11 +285,11 @@ typedef enum { - typedef struct { - seqDef* sequencesStart; - seqDef* sequences; /* ptr to end of sequences */ +@@ -273,62 +268,6 @@ typedef enum { + /*-******************************************* + * Private declarations + *********************************************/ +-typedef struct seqDef_s { +- U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ +- U16 litLength; +- U16 mlBase; /* mlBase == matchLength - MINMATCH */ +-} seqDef; +- +-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +-typedef enum { +- ZSTD_llt_none = 0, /* no longLengthType */ +- ZSTD_llt_literalLength = 1, /* represents a long literal */ +- ZSTD_llt_matchLength = 2 /* represents a long match */ +-} ZSTD_longLengthType_e; +- +-typedef struct { +- seqDef* sequencesStart; +- seqDef* sequences; /* ptr to end of sequences */ - BYTE* litStart; - BYTE* lit; /* ptr to end of literals */ - BYTE* llCode; - BYTE* mlCode; - BYTE* ofCode; -+ BYTE* litStart; -+ BYTE* lit; /* ptr to end of literals */ -+ BYTE* llCode; -+ BYTE* mlCode; -+ BYTE* ofCode; - size_t maxNbSeq; - size_t maxNbLit; - -@@ -301,8 +297,8 @@ typedef struct { - * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment - * the existing value of the litLength or matchLength by 0x10000. - */ +- size_t maxNbSeq; +- size_t maxNbLit; +- +- /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength +- * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment +- * the existing value of the litLength or matchLength by 0x10000. +- */ - ZSTD_longLengthType_e longLengthType; - U32 longLengthPos; /* Index of the sequence to apply long length modification to */ -+ ZSTD_longLengthType_e longLengthType; -+ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ - } seqStore_t; - - typedef struct { -@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS - seqLen.matchLength = seq->mlBase + MINMATCH; - if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { - if (seqStore->longLengthType == ZSTD_llt_literalLength) { +-} seqStore_t; +- +-typedef struct { +- U32 litLength; +- U32 matchLength; +-} ZSTD_sequenceLength; +- +-/* +- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences +- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
+- */ +-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +-{ +- ZSTD_sequenceLength seqLen; +- seqLen.litLength = seq->litLength; +- seqLen.matchLength = seq->mlBase + MINMATCH; +- if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { +- if (seqStore->longLengthType == ZSTD_llt_literalLength) { - seqLen.litLength += 0xFFFF; -+ seqLen.litLength += 0x10000; - } - if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- } +- if (seqStore->longLengthType == ZSTD_llt_matchLength) { - seqLen.matchLength += 0xFFFF; -+ seqLen.matchLength += 0x10000; - } - } - return seqLen; -@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS +- } +- } +- return seqLen; +-} + + /* + * Contains the compressed frame size and an upper-bound for the decompressed frame size. +@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getS * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` */ typedef struct { @@ -3757,7 +4567,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> unsigned long long decompressedBound; } ZSTD_frameSizeInfo; /* decompress & legacy */ - const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - -/* custom memory allocation functions */ @@ -3819,11 +4629,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -# endif - } -} -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - - +- +- /* ZSTD_invalidateRepCodes() : -@@ -420,13 +357,13 @@ typedef struct { + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +@@ -420,13 +296,13 @@ typedef struct { /*! 
ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ @@ -3839,6 +4650,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); +@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); + } + +- + #endif /* ZSTD_CCOMMON_H_MODULE */ --- a/lib/zstd/compress/clevels.h +++ b/lib/zstd/compress/clevels.h @@ -1,5 +1,6 @@ @@ -4028,6 +4845,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -26,6 +27,16 @@ unsigned HIST_isError(size_t code) { ret + /*-************************************************************** + * Histogram functions + ****************************************************************/ ++void HIST_add(unsigned* count, const void* src, size_t srcSize) ++{ ++ const BYTE* ip = (const BYTE*)src; ++ const BYTE* const end = ip + srcSize; ++ ++ while (ip<end) { ++ count[*ip++]++; ++ } ++} ++ + unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) + { --- a/lib/zstd/compress/hist.h +++ b/lib/zstd/compress/hist.h @@ -1,7 +1,8 @@ @@ -4040,6 +4874,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -73,3 +74,10 @@ size_t HIST_countFast_wksp(unsigned* cou + */ + unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize); ++ ++/*! HIST_add() : ++ * Lowest level: just add nb of occurrences of characters from @src into @count. ++ * @count is not reset. @count array is presumed large enough (i.e. 1 KB). ++ @ This function does not need any additional stack memory. ++ */ ++void HIST_add(unsigned* count, const void* src, size_t srcSize); --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -1,6 +1,7 @@ @@ -4919,13 +5764,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -11,12 +12,12 @@ +@@ -11,12 +12,13 @@ /*-************************************* * Dependencies ***************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" ++#include "../common/error_private.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" @@ -4933,7 +5779,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #include "../common/huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" -@@ -27,6 +28,7 @@ +@@ -27,6 +29,7 @@ #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" @@ -4941,7 +5787,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* *************************************************************** * Tuning parameters -@@ -55,14 +57,17 @@ +@@ -44,7 +47,7 @@ + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. 
+- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, ++ * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ + #ifndef ZSTD_HASHLOG3_MAX +@@ -55,14 +58,17 @@ * Helper functions ***************************************/ /* ZSTD_compressBound() @@ -4965,7 +5820,38 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CC +@@ -75,12 +81,12 @@ struct ZSTD_CDict_s { + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +- ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use ++ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +@@ -130,11 +136,12 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* wor + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + +- /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ +- if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; ++ /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ ++ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); +- cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); ++ cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); ++ cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +@@ -168,15 +175,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CC size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) { @@ -4984,7 +5870,62 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } return 0; } -@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const +@@ -205,7 +210,7 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CS + } + + /* private API call, for dictBuilder only */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + + /* Returns true if the strategy supports using a row based matchfinder */ + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +@@ -215,32 +220,23 @@ static int ZSTD_rowMatchFinderSupported( + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. 
+ */ +-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { ++static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); + } + + /* Returns row matchfinder usage given an initial mode and cParams */ +-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +- int const kHasSIMD128 = 1; +-#else +- int const kHasSIMD128 = 0; +-#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +- if (kHasSIMD128) { +- if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +- } else { +- if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +- } ++ if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + return mode; + } + + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +@@ -248,7 +244,7 @@ static ZSTD_paramSwitch_e ZSTD_resolveBl + + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. +@@ -257,16 +253,44 @@ static int ZSTD_allocateChainTable(const return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); } @@ -4994,9 +5935,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - * Returns 0 otherwise. + * Returns ZSTD_ps_disable otherwise. */ - static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, +-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, const ZSTD_compressionParameters* const cParams) { -@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEn + if (mode != ZSTD_ps_auto) return mode; return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? 
ZSTD_ps_enable : ZSTD_ps_disable; } @@ -5013,7 +5955,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } +} + -+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { + if (value != ZSTD_ps_auto) return value; + if (cLevel < 10) { + return ZSTD_ps_disable; @@ -5031,9 +5973,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { -@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxPar +@@ -282,8 +306,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxPar + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); } - cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); +- cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); ++ cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); + cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); @@ -5042,7 +5987,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(!ZSTD_checkCParams(cParams)); return cctxParams; } -@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_pa +@@ -329,10 +357,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_pa #define ZSTD_NO_CLEVEL 0 /* @@ -5058,17 +6003,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { assert(!ZSTD_checkCParams(params->cParams)); ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); -@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_interna +@@ -343,10 +374,13 @@ static void ZSTD_CCtxParams_init_interna + */ + cctxParams->compressionLevel = compressionLevel; cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); - cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); +- cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); ++ cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, ¶ms->cParams); cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); + cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); + cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); + cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", - cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); +- cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); ++ cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); } -@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZST + + size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +@@ -359,7 +393,7 @@ size_t 
ZSTD_CCtxParams_init_advanced(ZST /* * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. @@ -5077,7 +6028,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ static void ZSTD_CCtxParams_setZstdParams( ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) -@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c +@@ -455,8 +489,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c return bounds; case ZSTD_c_enableLongDistanceMatching: @@ -5088,7 +6039,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return bounds; case ZSTD_c_ldmHashLog: -@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c +@@ -534,11 +568,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c + bounds.upperBound = 1; + return bounds; + +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + ++ case ZSTD_c_blockSplitterLevel: ++ bounds.lowerBound = 0; ++ bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; ++ return bounds; ++ + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; +@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_c bounds.upperBound = 1; return bounds; @@ -5107,7 +6076,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + -+ case ZSTD_c_searchForExternalRepcodes: ++ case ZSTD_c_repcodeResolution: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; @@ -5115,7 +6084,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> default: bounds.error = ERROR(parameter_unsupported); return bounds; -@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZS +@@ -567,10 +626,11 @@ static size_t ZSTD_cParam_clampBounds(ZS return 0; } @@ -5131,18 +6100,30 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) -@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ - case ZSTD_c_useBlockSplitter: +@@ -584,6 +644,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: ++ case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: +@@ -610,9 +671,13 @@ static int ZSTD_isUpdateAuthorized(ZSTD_ + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: case ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: -+ case ZSTD_c_searchForExternalRepcodes: ++ case ZSTD_c_repcodeResolution: default: return 0; } -@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* +@@ -625,7 +690,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { @@ -5151,14 +6132,19 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } switch(param) -@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* - case ZSTD_c_useBlockSplitter: +@@ -665,9 +730,14 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: ++ case ZSTD_c_blockSplitterLevel: case 
ZSTD_c_useRowMatchFinder: case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: -+ case ZSTD_c_searchForExternalRepcodes: ++ case ZSTD_c_repcodeResolution: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -5208,8 +6194,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } case ZSTD_c_literalCompressionMode : { - const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; - BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; + BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; @@ -5222,8 +6209,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + return (size_t)CCtxParams->enableDedicatedDictSearch; case ZSTD_c_enableLongDistanceMatching : +- CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); - CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; ++ CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; return CCtxParams->ldmParams.enableLdm; case ZSTD_c_ldmHashLog : @@ -5273,16 +6261,40 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> case ZSTD_c_stableInBuffer: BOUNDCHECK(ZSTD_c_stableInBuffer, value); -@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD +@@ -843,28 +916,55 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); +- CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; ++ CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + case ZSTD_c_validateSequences: BOUNDCHECK(ZSTD_c_validateSequences, value); CCtxParams->validateSequences = value; - return CCtxParams->validateSequences; + return (size_t)CCtxParams->validateSequences; - case ZSTD_c_useBlockSplitter: - BOUNDCHECK(ZSTD_c_useBlockSplitter, value); -@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD +- case ZSTD_c_useBlockSplitter: +- BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +- CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; +- return CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences: ++ BOUNDCHECK(ZSTD_c_splitAfterSequences, value); ++ CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->postBlockSplitter; ++ ++ case ZSTD_c_blockSplitterLevel: ++ BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); ++ CCtxParams->preBlockSplitter_level = value; ++ return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); +- CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; ++ CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + case ZSTD_c_deterministicRefPrefix: BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); CCtxParams->deterministicRefPrefix = !!value; @@ -5291,7 +6303,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + + case ZSTD_c_prefetchCDictTables: + BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); -+ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; + return CCtxParams->prefetchCDictTables; + + case ZSTD_c_enableSeqProducerFallback: @@ -5302,17 +6314,100 @@ Signed-off-by: Oleksandr 
Natalenko <oleksandr@natalenko.name> + case ZSTD_c_maxBlockSize: + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_maxBlockSize, value); -+ CCtxParams->maxBlockSize = value; ++ assert(value>=0); ++ CCtxParams->maxBlockSize = (size_t)value; + return CCtxParams->maxBlockSize; + -+ case ZSTD_c_searchForExternalRepcodes: -+ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); -+ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ case ZSTD_c_repcodeResolution: ++ BOUNDCHECK(ZSTD_c_repcodeResolution, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; + return CCtxParams->searchForExternalRepcodes; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } -@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( +@@ -881,7 +981,7 @@ size_t ZSTD_CCtxParams_getParameter( + switch(param) + { + case ZSTD_c_format : +- *value = CCtxParams->format; ++ *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; +@@ -896,16 +996,16 @@ size_t ZSTD_CCtxParams_getParameter( + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : +- *value = CCtxParams->cParams.searchLog; ++ *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : +- *value = CCtxParams->cParams.minMatch; ++ *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : +- *value = CCtxParams->cParams.targetLength; ++ *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : +- *value = (unsigned)CCtxParams->cParams.strategy; ++ *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; +@@ -920,10 +1020,10 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : +- *value = CCtxParams->attachDictPref; ++ *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : +- *value = CCtxParams->literalCompressionMode; ++ *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : + assert(CCtxParams->nbWorkers == 0); +@@ -939,19 +1039,19 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : +- *value = CCtxParams->ldmParams.enableLdm; ++ *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : +- *value = CCtxParams->ldmParams.hashLog; ++ *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : +- *value = CCtxParams->ldmParams.minMatchLength; ++ *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : +- *value = CCtxParams->ldmParams.bucketSizeLog; ++ *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : +- *value = CCtxParams->ldmParams.hashRateLog; ++ *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; +@@ -971,8 +1071,11 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; +- case ZSTD_c_useBlockSplitter : +- *value = (int)CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences : ++ *value = (int)CCtxParams->postBlockSplitter; ++ break; ++ case ZSTD_c_blockSplitterLevel : ++ *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = 
(int)CCtxParams->useRowMatchFinder; +@@ -980,6 +1083,18 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_deterministicRefPrefix: *value = (int)CCtxParams->deterministicRefPrefix; break; @@ -5325,13 +6420,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + case ZSTD_c_maxBlockSize: + *value = (int)CCtxParams->maxBlockSize; + break; -+ case ZSTD_c_searchForExternalRepcodes: ++ case ZSTD_c_repcodeResolution: + *value = (int)CCtxParams->searchForExternalRepcodes; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; -@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxP +@@ -1006,9 +1121,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxP return 0; } @@ -5341,13 +6436,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); + /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); + return 0; +} + @@ -5380,7 +6475,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; -@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_rev +@@ -1024,9 +1177,9 @@ static void ZSTD_dedicatedDictSearch_rev ZSTD_compressionParameters* cParams); /* @@ -5393,7 +6488,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) { -@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CC +@@ -1039,8 +1192,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CC return 0; } if (dl->cdict != NULL) { @@ -5403,7 +6498,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return 0; } assert(dl->dictSize > 0); -@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CC +@@ -1060,26 +1213,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CC } size_t ZSTD_CCtx_loadDictionary_advanced( @@ -5444,7 +6539,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } cctx->localDict.dictSize = dictSize; cctx->localDict.dictContentType = dictContentType; -@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, +@@ -1149,7 +1306,7 @@ 
size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, @@ -5453,7 +6548,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_clearAllDicts(cctx); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } -@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressio +@@ -1168,7 +1325,7 @@ size_t ZSTD_checkCParams(ZSTD_compressio + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); +- BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); ++ BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy); + return 0; + } + +@@ -1178,11 +1335,12 @@ size_t ZSTD_checkCParams(ZSTD_compressio static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) { @@ -5471,13 +6575,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) CLAMP(ZSTD_c_windowLog, cParams.windowLog); CLAMP(ZSTD_c_chainLog, cParams.chainLog); -@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters +@@ -1240,19 +1398,62 @@ static U32 ZSTD_dictAndWindowLog(U32 win + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. +- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. ++ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ + static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize, - ZSTD_cParamMode_e mode) -+ ZSTD_cParamMode_e mode, -+ ZSTD_paramSwitch_e useRowMatchFinder) ++ ZSTD_CParamMode_e mode, ++ ZSTD_ParamSwitch_e useRowMatchFinder) { const U64 minSrcSize = 513; /* (1<<9) + 1 */ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); @@ -5528,7 +6640,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> switch (mode) { case ZSTD_cpm_unknown: case ZSTD_cpm_noAttachDict: -@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compres +@@ -1281,8 +1482,8 @@ ZSTD_adjustCParams_internal(ZSTD_compres } /* resize windowLog if input is small enough, to use less memory */ @@ -5539,7 +6651,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? 
ZSTD_HASHLOG_MIN : -@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compres +@@ -1300,6 +1501,42 @@ ZSTD_adjustCParams_internal(ZSTD_compres if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ @@ -5582,7 +6694,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return cPar; } -@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParam +@@ -1310,11 +1547,11 @@ ZSTD_adjustCParams(ZSTD_compressionParam { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; @@ -5590,8 +6702,28 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); } - static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); -@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCPara +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); ++static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, +@@ -1330,24 +1567,25 @@ static void ZSTD_overrideCParams( + } + + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { +- srcSizeHint = CCtxParams->srcSizeHint; ++ assert(CCtxParams->srcSizeHint>=0); ++ srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint == 0 means 0 */ @@ -5600,21 +6732,37 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } static size_t -@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compre - + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) + ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, +- const U32 enableDedicatedDictSearch, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, ++ const int enableDedicatedDictSearch, + const U32 forCCtx) + { + /* chain table size should be 0 for fast or row-hash strategies */ +@@ -1363,14 +1601,14 @@ ZSTD_sizeof_matchState(const ZSTD_compre + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = +- ZSTD_cwksp_aligned_alloc_size((MaxML+1) * 
sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) - + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); -+ + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) -+ + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) - ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) -+ ? ZSTD_cwksp_aligned_alloc_size(hSize) ++ ? ZSTD_cwksp_aligned64_alloc_size(hSize) : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace -@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compre +@@ -1386,30 +1624,38 @@ ZSTD_sizeof_matchState(const ZSTD_compre return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } @@ -5628,8 +6776,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, -@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usin - const ZSTD_paramSwitch_e useRowMatchFinder, + const int isStatic, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, - const U64 pledgedSrcSize) @@ -5644,21 +6793,37 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) +- + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) ++ + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); -@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usin +- size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); ++ size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? +- ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; ++ ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + + + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) +@@ -1417,15 +1663,21 @@ static size_t ZSTD_estimateCCtxSize_usin size_t const cctxSpace = isStatic ? 
ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + size_t const externalSeqSpace = useSequenceProducer -+ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ ? ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + size_t const neededSpace = cctxSpace + - entropySpace + -@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usin +- entropySpace + ++ tmpWorkSpace + + blockStateSpace + + ldmSpace + ldmSeqSpace + matchStateSize + tokenSpace + @@ -5668,7 +6833,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; -@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa +@@ -1435,7 +1687,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa + { + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); + + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); +@@ -1443,7 +1695,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxPa * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( @@ -5677,7 +6851,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) -@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt +@@ -1493,18 +1745,18 @@ size_t ZSTD_estimateCStreamSize_usingCCt RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); @@ -5686,7 +6860,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) ? ((size_t)1 << cParams.windowLog) + blockSize : 0; -@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCt + size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); return ZSTD_estimateCCtxSize_usingCCtxParams_internal( &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, @@ -5695,7 +6873,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -1637,6 +1879,19 @@ typedef enum { +@@ -1600,7 +1852,7 @@ void ZSTD_reset_compressedBlockState(ZST + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). 
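
For orientation, the size-estimate hunks above just sum fixed components: a window-sized input buffer plus one block for buffered streaming input, and a compress-bound-sized buffer for one block of output. A minimal sketch of that arithmetic; bound() here is an assumed stand-in for ZSTD_compressBound(), not the kernel macro:

    /* Sketch only: sizing the streaming buffers the way the estimate
     * above does. The worst-case margin in bound() is an assumption. */
    #include <stddef.h>
    #include <stdio.h>

    static size_t bound(size_t srcSize)
    {
        return srcSize + (srcSize >> 8) + 64;  /* assumed expansion margin */
    }

    int main(void)
    {
        unsigned const windowLog = 20;                  /* 1 MiB window */
        size_t const windowSize  = (size_t)1 << windowLog;
        size_t const maxBlock    = (size_t)128 << 10;   /* ZSTD_BLOCKSIZE_MAX */
        size_t const blockSize   = windowSize < maxBlock ? windowSize : maxBlock;
        size_t const inBuffSize  = windowSize + blockSize; /* buffered input */
        size_t const outBuffSize = bound(blockSize) + 1;   /* one block out */
        printf("inBuff=%zu outBuff=%zu\n", inBuffSize, outBuffSize);
        return 0;
    }
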
+ */ +-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) ++static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) + { + ZSTD_window_clear(&ms->window); + +@@ -1637,12 +1889,25 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; @@ -5709,13 +6896,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +} + +/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ -+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { + ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); +} static size_t - ZSTD_reset_matchState(ZSTD_matchState_t* ms, -@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* +-ZSTD_reset_matchState(ZSTD_matchState_t* ms, ++ZSTD_reset_matchState(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +@@ -1664,6 +1929,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* } ms->hashLog3 = hashLog3; @@ -5723,7 +6918,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_invalidateMatchState(ms); -@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* +@@ -1685,22 +1951,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ZSTD_cwksp_clean_tables(ws); } @@ -5752,39 +6947,51 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_advanceHashSalt(ms); + } else { + /* When we are not salting we want to always memset the memory */ -+ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ms->hashSalt = 0; } { /* Switch to 32-entry rows if searchLog is 5 (or more) */ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); -@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* +@@ -1709,6 +1972,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* } } + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); -+ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); -+ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); -+ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); -+ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); -+ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); -+ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned)); ++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * 
sizeof(ZSTD_optimal_t)); + } + ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, -@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1754,7 +2028,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + { + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", +- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter); ++ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + + zc->isFirstBlock = 1; +@@ -1766,8 +2040,9 @@ static size_t ZSTD_resetCCtx_internal(ZS + params = &zc->appliedParams; + assert(params->useRowMatchFinder != ZSTD_ps_auto); - assert(params->useBlockSplitter != ZSTD_ps_auto); +- assert(params->useBlockSplitter != ZSTD_ps_auto); ++ assert(params->postBlockSplitter != ZSTD_ps_auto); assert(params->ldmParams.enableLdm != ZSTD_ps_auto); + assert(params->maxBlockSize != 0); if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); -@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1776,9 +2051,8 @@ static size_t ZSTD_resetCCtx_internal(ZS } { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); @@ -5796,7 +7003,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; -@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1795,8 +2069,7 @@ static size_t ZSTD_resetCCtx_internal(ZS size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, @@ -5806,7 +7013,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); -@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1805,7 +2078,7 @@ static size_t ZSTD_resetCCtx_internal(ZS { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); @@ -5815,7 +7022,26 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); -@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1823,21 +2096,23 @@ static size_t ZSTD_resetCCtx_internal(ZS + + DEBUGLOG(5, "reserving object space"); + /* Statically sized space. 
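
On the salting hunks above: ZSTD_advanceHashSalt() folds fresh entropy into the row-hash salt whenever the tag table is reused without a memset, so stale tags cannot alias new positions. The mixer below only illustrates the idea; it uses SplitMix64 finalizer constants as a stand-in and is not the kernel's ZSTD_bitmix():

    /* Illustrative stand-in for the salt-advancing step shown above.
     * mix64() is SplitMix64's finalizer, assumed here for the demo. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t mix64(uint64_t x)
    {
        x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
        x ^= x >> 27; x *= 0x94d049bb133111ebULL;
        x ^= x >> 31;
        return x;
    }

    int main(void)
    {
        uint64_t salt = 0, entropy = 0x12345678u;
        salt = mix64(salt) ^ mix64(entropy);  /* cf. ZSTD_advanceHashSalt() */
        printf("new salt: %016llx\n", (unsigned long long)salt);
        return 0;
    }
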
+- * entropyWorkspace never moves, ++ * tmpWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); +- zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); +- RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); ++ zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); ++ RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); ++ zc->tmpWkspSize = TMP_WORKSPACE_SIZE; + } } + + ZSTD_cwksp_clear(ws); /* init params */ zc->blockState.matchState.cParams = params->cParams; @@ -5823,7 +7049,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; -@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1845,7 +2120,7 @@ static size_t ZSTD_resetCCtx_internal(ZS + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); +- zc->blockSize = blockSize; ++ zc->blockSizeMax = blockSize; + + xxh64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; +@@ -1854,13 +2129,46 @@ static size_t ZSTD_resetCCtx_internal(ZS ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); @@ -5836,15 +7071,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + -+ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); + + /* ldm hash table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? 
*/ + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; -+ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); + ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); -+ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); @@ -5856,7 +7091,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + zc->extSeqBufCapacity = maxNbExternalSeq; + zc->extSeqBuf = -+ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + + /* buffers */ @@ -5871,7 +7106,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); -@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZS +@@ -1883,32 +2191,9 @@ static size_t ZSTD_resetCCtx_internal(ZS zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); @@ -5905,7 +7140,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zc->initialized = 1; -@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt +@@ -1980,7 +2265,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt } params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, @@ -5915,7 +7150,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> params.cParams.windowLog = windowLog; params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, -@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt +@@ -2019,6 +2305,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCt return 0; } @@ -5938,7 +7173,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, -@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCD +@@ -2054,26 +2356,29 @@ static size_t ZSTD_resetCCtx_byCopyingCD : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; @@ -5971,24 +7206,89 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZST - params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + /* Zero the hashTable3, since the cdict never fills it */ +- { int const h3log = cctx->blockState.matchState.hashLog3; ++ assert(cctx->blockState.matchState.hashLog3 <= 31); ++ { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? 
((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); +@@ -2082,8 +2387,8 @@ static size_t ZSTD_resetCCtx_byCopyingCD + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ +- { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; +- ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; ++ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ++ ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2141,12 +2446,13 @@ static size_t ZSTD_copyCCtx_internal(ZST + /* Copy only compression parameters related to tables. */ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); +- assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); ++ assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; +- params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; ++ params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; params.ldmParams = srcCCtx->appliedParams.ldmParams; params.fParams = fParams; + params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); -@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_match +@@ -2166,7 +2472,7 @@ static size_t ZSTD_copyCCtx_internal(ZST + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; +- int const h3log = srcCCtx->blockState.matchState.hashLog3; ++ U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, +@@ -2184,8 +2490,8 @@ static size_t ZSTD_copyCCtx_internal(ZST + + /* copy dictionary offsets */ + { +- const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; +- ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; ++ const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ++ ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2234,7 +2540,7 @@ ZSTD_reduceTable_internal (U32* const ta + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ +- assert(size < (1U<<31)); /* can be casted to int */ ++ assert(size < (1U<<31)); /* can be cast to int */ + + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { +@@ -2267,7 +2573,7 @@ static void ZSTD_reduceTable_btlazy2(U32 + + /*! 
ZSTD_reduceIndex() : + * rescale all indexes to avoid future overflow (indexes are U32) */ +-static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) ++static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) + { + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); +@@ -2294,26 +2600,32 @@ static void ZSTD_reduceIndex (ZSTD_match /* See doc/zstd_compression_format.md for detailed format description */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) { - const seqDef* const sequences = seqStorePtr->sequencesStart; +- const seqDef* const sequences = seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; -@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* s + BYTE* const ofCodeTable = seqStorePtr->ofCode; BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; @@ -6014,7 +7314,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* ZSTD_useTargetCBlockSize(): -@@ -2347,6 +2647,7 @@ typedef struct { +@@ -2333,9 +2645,9 @@ static int ZSTD_useTargetCBlockSize(cons + * Returns 1 if true, 0 otherwise. */ + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) + { +- DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); +- assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); +- return (cctxParams->useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); ++ assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); ++ return (cctxParams->postBlockSplitter == ZSTD_ps_enable); + } + + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types +@@ -2347,6 +2659,7 @@ typedef struct { U32 MLtype; size_t size; size_t lastCountSize; /* Accounts for bug in 1.3.4. 
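
A note on the long-length convention these hunks keep relying on: each stored sequence holds its literal length in 16 bits, so at most one sequence per block may overflow; its index is recorded in longLengthPos and consumers add 0x10000 back, exactly as the +0x10000 corrections further down do. In miniature, with types simplified for the example:

    /* Simplified model of the seqStore long-length correction. */
    #include <assert.h>

    typedef struct { unsigned short litLength; } MiniSeq; /* 16-bit field */

    static unsigned long full_lit_length(const MiniSeq *s, unsigned idx,
                                         unsigned longLengthPos, int litLenIsLong)
    {
        unsigned long ll = s[idx].litLength;
        if (litLenIsLong && idx == longLengthPos)
            ll += 0x10000;  /* undo the 16-bit truncation */
        return ll;
    }

    int main(void)
    {
        MiniSeq seqs[2] = { { 1234 }, { 70000 & 0xFFFF } };
        assert(full_lit_length(seqs, 1, 1, 1) == 70000);
        return 0;
    }
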
More detail in ZSTD_entropyCompressSeqStore_internal() */ @@ -6022,7 +7335,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } ZSTD_symbolEncodingTypeStats_t; /* ZSTD_buildSequencesStatistics(): -@@ -2357,11 +2658,13 @@ typedef struct { +@@ -2357,11 +2670,13 @@ typedef struct { * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) */ static ZSTD_symbolEncodingTypeStats_t @@ -6032,7 +7345,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - ZSTD_strategy strategy, unsigned* countWorkspace, - void* entropyWorkspace, size_t entropyWkspSize) { +ZSTD_buildSequencesStatistics( -+ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const SeqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, @@ -6041,7 +7354,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> BYTE* const ostart = dst; const BYTE* const oend = dstEnd; BYTE* op = ostart; -@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t +@@ -2375,7 +2690,7 @@ ZSTD_buildSequencesStatistics(seqStore_t stats.lastCountSize = 0; /* convert length/distances into codes */ @@ -6050,7 +7363,43 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(op <= oend); assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ -@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t +@@ -2392,7 +2707,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, ++ CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, +@@ -2413,7 +2728,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ +- ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; ++ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, +@@ -2424,7 +2739,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, ++ CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, +@@ -2454,7 +2769,7 @@ ZSTD_buildSequencesStatistics(seqStore_t + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, ++ CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, +@@ -2480,22 +2795,23 @@ ZSTD_buildSequencesStatistics(seqStore_t */ #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t @@ -6062,11 +7411,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - void* entropyWorkspace, size_t entropyWkspSize, - const int bmi2) +ZSTD_entropyCompressSeqStore_internal( -+ const seqStore_t* seqStorePtr, ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, -+ void* dst, size_t dstCapacity, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) { @@ -6076,13 +7426,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; - const seqDef* const sequences = seqStorePtr->sequencesStart; +- const seqDef* const sequences = seqStorePtr->sequencesStart; - const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; + const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; -@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(se +@@ -2503,29 +2819,28 @@ ZSTD_entropyCompressSeqStore_internal(se BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t lastCountSize; @@ -6097,14 +7448,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); /* Compress literals */ - { const BYTE* const literals = seqStorePtr->litStart; +- { const BYTE* const literals = seqStorePtr->litStart; - size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; - size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; -+ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); -+ size_t const numLiterals = 
(size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); /* Base suspicion of uncompressibility on ratio of literals to sequences */ - unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); - size_t const litSize = (size_t)(seqStorePtr->lit - literals); +- unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); +- size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const cSize = ZSTD_compressLiterals( - &prevEntropy->huf, &nextEntropy->huf, @@ -6121,7 +7472,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; -@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(se +@@ -2551,11 +2866,10 @@ ZSTD_entropyCompressSeqStore_internal(se ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } @@ -6136,7 +7487,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> &prevEntropy->fse, &nextEntropy->fse, op, oend, strategy, count, -@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(se +@@ -2564,6 +2878,7 @@ ZSTD_entropyCompressSeqStore_internal(se *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); lastCountSize = stats.lastCountSize; op += stats.size; @@ -6144,10 +7495,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } { size_t const bitstreamSize = ZSTD_encodeSequences( -@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(se +@@ -2597,104 +2912,146 @@ ZSTD_entropyCompressSeqStore_internal(se + return (size_t)(op - ostart); } - MEM_STATIC size_t +-MEM_STATIC size_t -ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, @@ -6156,31 +7508,38 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - size_t srcSize, - void* entropyWorkspace, size_t entropyWkspSize, - int bmi2) -+ZSTD_entropyCompressSeqStore( -+ const seqStore_t* seqStorePtr, ++static size_t ++ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ size_t blockSize, ++ const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, -+ void* dst, size_t dstCapacity, -+ size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) { size_t const cSize = ZSTD_entropyCompressSeqStore_internal( - seqStorePtr, prevEntropy, nextEntropy, cctxParams, -@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* +- seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, ++ literals, litSize, ++ seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
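
The suspicion heuristic used above is plain arithmetic: with no sequences at all, or at least SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO (20, per the definition quoted above) literal bytes per sequence, the literals are presumed hard to compress and the encoder can skip the more expensive analysis. In miniature:

    /* The ratio test above, extracted for clarity. */
    #include <stddef.h>
    #include <stdio.h>

    #define SUSPECT_RATIO 20  /* SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO */

    static int suspect_uncompressible(size_t litSize, size_t nbSeq)
    {
        return (nbSeq == 0) || (litSize / nbSeq >= SUSPECT_RATIO);
    }

    int main(void)
    {
        printf("%d\n", suspect_uncompressible(4096, 10));  /* 1: ~410 lit/seq */
        printf("%d\n", suspect_uncompressible(4096, 300)); /* 0: many matches */
        return 0;
    }
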
*/ - if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) -+ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { + DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); return 0; /* block not compressed */ + } FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ - { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); +- { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); ++ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); @@ -6192,8 +7551,36 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return cSize; } -@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCom - static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++static size_t ++ZSTD_entropyCompressSeqStore( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) ++{ ++ return ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ dst, dstCapacity, ++ seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), ++ srcSize, ++ seqStorePtr, ++ prevEntropy, nextEntropy, ++ cctxParams, ++ entropyWorkspace, entropyWkspSize, ++ bmi2); ++} ++ + /* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) + { +- static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++ static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, ZSTD_compressBlock_fast, - ZSTD_compressBlock_doubleFast, @@ -6263,10 +7650,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> NULL, NULL, NULL, -@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCom - DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + NULL } + }; +- ZSTD_blockCompressor selectedCompressor; ++ ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); +- DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); ++ DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { - static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { - { ZSTD_compressBlock_greedy_row, - 
ZSTD_compressBlock_lazy_row, - ZSTD_compressBlock_lazy2_row }, @@ -6279,6 +7674,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, - ZSTD_compressBlock_lazy_dedicatedDictSearch_row, - ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { + { + ZSTD_COMPRESSBLOCK_GREEDY_ROW, + ZSTD_COMPRESSBLOCK_LAZY_ROW, @@ -6300,12 +7696,32 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW + } }; - DEBUGLOG(4, "Selecting a row-based matchfinder"); +- DEBUGLOG(4, "Selecting a row-based matchfinder"); ++ DEBUGLOG(5, "Selecting a row-based matchfinder"); assert(useRowMatchFinder != ZSTD_ps_auto); -@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPt + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { +@@ -2704,30 +3061,126 @@ ZSTD_blockCompressor ZSTD_selectBlockCom + return selectedCompressor; + } + +-static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, ++static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) + { + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr) ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr) + { + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; ssPtr->longLengthType = ZSTD_llt_none; } +-typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; +/* ZSTD_postProcessSequenceProducerResult() : + * Validates and post-processes sequences obtained through the external matchfinder API: + * - Checks whether nbExternalSeqs represents an error condition. @@ -6372,10 +7788,41 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + return litLenSum + matchLenSum; +} + - typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ++/* ++ * Function to validate sequences produced by a block compressor. ++ */ ++static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) ++{ ++#if DEBUGLEVEL >= 1 ++ const SeqDef* seq = seqStore->sequencesStart; ++ const SeqDef* const seqEnd = seqStore->sequences; ++ size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; ++ for (; seq < seqEnd; ++seq) { ++ const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); ++ assert(seqLength.matchLength >= matchLenLowerBound); ++ (void)seqLength; ++ (void)matchLenLowerBound; ++ } ++#else ++ (void)seqStore; ++ (void)cParams; ++#endif ++} ++ ++static size_t ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) -@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + { +- ZSTD_matchState_t* const ms = &zc->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); @@ -6386,7 +7833,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); } else { -@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC +@@ -2763,6 +3216,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC } if (zc->externSeqStore.pos < zc->externSeqStore.size) { assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); @@ -6402,10 +7849,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, -@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC +@@ -2772,7 +3234,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { - rawSeqStore_t ldmSeqStore = kNullRawSeqStore; - +- rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ RawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( @@ -6413,11 +7863,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." 
+ ); -+ + ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; - /* Updates ldmSeqStore.size */ -@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC +@@ -2788,42 +3258,116 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); @@ -6452,11 +7901,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + + /* Return early if there is no error, since we don't need to worry about last literals */ + if (!ZSTD_isError(nbPostProcessedSeqs)) { -+ ZSTD_sequencePosition seqPos = {0,0,0}; ++ ZSTD_SequencePosition seqPos = {0,0,0}; + size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); + RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); + FORWARD_IF_ERROR( -+ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ ZSTD_transferSequences_wBlockDelim( + zc, &seqPos, + zc->extSeqBuf, nbPostProcessedSeqs, + src, srcSize, @@ -6475,7 +7924,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } + + /* Fallback to software matchfinder */ -+ { ZSTD_blockCompressor const blockCompressor = ++ { ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, @@ -6489,19 +7938,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } } + } else { /* not long range mode and no external matchfinder */ -+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } -@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CC + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } ++ ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); return ZSTDbss_compress; } -static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) -+static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) { - const seqStore_t* seqStore = ZSTD_getSeqStore(zc); - const seqDef* seqStoreSeqs = seqStore->sequencesStart; @@ -6511,14 +7963,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - size_t lastLLSize; - - ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; -+ const seqDef* inSeqs = seqStore->sequencesStart; -+ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const SeqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); + const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + + ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? 
seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; + const size_t nbOutSequences = nbInSequences + 1; + size_t nbOutLiterals = 0; -+ repcodes_t repcodes; ++ Repcodes_t repcodes; size_t i; - repcodes_t updatedRepcodes; @@ -6553,7 +8005,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (i == seqStore->longLengthPos) { if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; -@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD +@@ -2832,46 +3376,75 @@ static void ZSTD_copyBlockSequences(ZSTD } } @@ -6628,9 +8080,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, -@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + size_t outSeqsSize, const void* src, size_t srcSize) + { const size_t dstCapacity = ZSTD_compressBound(srcSize); - void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); +- void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); ++ void* dst; /* Make C90 happy. */ SeqCollector seqCollector; + { + int targetCBlockSize; @@ -6643,9 +8097,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); + } ++ dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); -@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* + seqCollector.collectSequences = 1; +@@ -2880,8 +3453,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* seqCollector.maxSequences = outSeqsSize; zc->seqCollector = seqCollector; @@ -6660,7 +8116,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return zc->seqCollector.seqIndex; } -@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, s +@@ -2910,19 +3487,17 @@ static int ZSTD_isRLE(const BYTE* src, s const size_t unrollMask = unrollSize - 1; const size_t prefixLength = length & unrollMask; size_t i; @@ -6682,7 +8138,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return 1; } -@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t cons +@@ -2930,7 +3505,7 @@ static int ZSTD_isRLE(const BYTE* src, s + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +-static int ZSTD_maybeRLE(seqStore_t const* seqStore) ++static int ZSTD_maybeRLE(SeqStore_t const* seqStore) + { + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); +@@ -2938,7 +3513,8 @@ static int ZSTD_maybeRLE(seqStore_t cons return nbSeqs < 4 && nbLits < 10; } @@ -6692,7 +8157,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; bs->prevCBlock = bs->nextCBlock; -@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepco +@@ -2946,12 +3522,14 @@ static void ZSTD_blockState_confirmRepco } /* Writes the block header */ @@ -6703,7 +8168,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 const cBlockHeader = cSize == 1 ? 
lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); -@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, s + MEM_writeLE24(op, cBlockHeader); +- DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); ++ DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); + } + + /* ZSTD_buildBlockEntropyStats_literals() : +@@ -2959,13 +3537,16 @@ static void writeBlockHeader(void* op, s * Stores literals block type (raw, rle, compressed, repeat) and * huffman description table to hufMetadata. * Requires ENTROPY_WORKSPACE_SIZE workspace @@ -6727,7 +8198,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { BYTE* const wkspStart = (BYTE*)workspace; BYTE* const wkspEnd = wkspStart + wkspSize; -@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStat +@@ -2973,9 +3554,9 @@ static size_t ZSTD_buildBlockEntropyStat unsigned* const countWksp = (unsigned*)workspace; const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); BYTE* const nodeWksp = countWkspStart + countWkspSize; @@ -6739,7 +8210,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> HUF_repeat repeat = prevHuf->repeatMode; DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); -@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStat +@@ -2990,73 +3571,77 @@ static size_t ZSTD_buildBlockEntropyStat /* small ? don't even attempt compression (speed opt) */ #ifndef COMPRESS_LITERALS_SIZE_MIN @@ -6851,7 +8322,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStat +@@ -3066,8 +3651,9 @@ static size_t ZSTD_buildBlockEntropyStat * and updates nextEntropy to the appropriate repeatMode. */ static ZSTD_symbolEncodingTypeStats_t @@ -6863,7 +8334,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> nextEntropy->litlength_repeatMode = FSE_repeat_none; nextEntropy->offcode_repeatMode = FSE_repeat_none; nextEntropy->matchlength_repeatMode = FSE_repeat_none; -@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_ +@@ -3078,16 +3664,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_ * Builds entropy for the sequences. * Stores symbol compression modes and fse table to fseMetadata. * Requires ENTROPY_WORKSPACE_SIZE wksp. 
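
The writeBlockHeader() logic above packs the standard zstd 3-byte block header: bit 0 carries the last-block flag, bits 1-2 the block type (0 raw, 1 RLE, 2 compressed), and the remaining 21 bits the size (block size for RLE, compressed size otherwise). A self-contained version of that packing:

    /* zstd block header packing, as in writeBlockHeader() above. */
    #include <stdio.h>

    enum { bt_raw = 0, bt_rle = 1, bt_compressed = 2 };

    static void write_le24(unsigned char *p, unsigned v)
    {
        p[0] = (unsigned char) v;
        p[1] = (unsigned char)(v >> 8);
        p[2] = (unsigned char)(v >> 16);
    }

    int main(void)
    {
        unsigned char hdr[3];
        unsigned const lastBlock = 1, cSize = 1000;
        write_le24(hdr, lastBlock + ((unsigned)bt_compressed << 1) + (cSize << 3));
        printf("%02x %02x %02x\n", (unsigned)hdr[0], (unsigned)hdr[1], (unsigned)hdr[2]);
        return 0;
    }
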
@@ -6877,7 +8348,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * @return : size of fse tables or error code */ +static size_t +ZSTD_buildBlockEntropyStats_sequences( -+ const seqStore_t* seqStorePtr, ++ const SeqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, @@ -6890,7 +8361,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> BYTE* const ostart = fseMetadata->fseTablesBuffer; BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); BYTE* op = ostart; -@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStat +@@ -3103,9 +3691,9 @@ static size_t ZSTD_buildBlockEntropyStat + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +- fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +- fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +- fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; ++ fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; ++ fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; ++ fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; + } +@@ -3114,23 +3702,28 @@ static size_t ZSTD_buildBlockEntropyStat /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * Requires workspace size ENTROPY_WORKSPACE_SIZE @@ -6908,7 +8392,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -{ - size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; +size_t ZSTD_buildBlockEntropyStats( -+ const seqStore_t* seqStorePtr, ++ const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, @@ -6930,7 +8414,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); entropyMetadata->fseMetadata.fseTablesSize = ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, -@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqSt +@@ -3143,11 +3736,12 @@ size_t ZSTD_buildBlockEntropyStats(seqSt } /* Returns the size estimate for the literals section (header + content) of a block */ @@ -6948,7 +8432,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { unsigned* const countWksp = (unsigned*)workspace; unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; -@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_lit +@@ -3169,12 +3763,13 @@ static size_t ZSTD_estimateBlockSize_lit } /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ @@ -6959,7 +8443,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, - void* workspace, size_t wkspSize) +static size_t -+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, @@ -6968,7 +8452,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { unsigned* const countWksp = (unsigned*)workspace; const BYTE* ctp = codeTable; -@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_sym +@@ -3206,116 +3801,121 @@ static size_t ZSTD_estimateBlockSize_sym } 
/* Returns the size estimate for the sequences section (header + content) of a block */ @@ -7059,7 +8543,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { - ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; +static size_t -+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) +{ + ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); @@ -7070,7 +8554,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> entropyMetadata, - zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); - return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), -+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize), ""); + return ZSTD_estimateBlockSize( + seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), seqStore->ofCode, seqStore->llCode, seqStore->mlCode, @@ -7078,13 +8562,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + &zc->blockState.nextCBlock->entropy, + entropyMetadata, -+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ zc->tmpWorkspace, zc->tmpWkspSize, (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); } /* Returns literals bytes represented in a seqStore */ -static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { -+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) +{ size_t literalsBytes = 0; - size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; @@ -7092,7 +8576,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t i; for (i = 0; i < nbSeqs; ++i) { - seqDef seq = seqStore->sequencesStart[i]; -+ seqDef const seq = seqStore->sequencesStart[i]; ++ SeqDef const seq = seqStore->sequencesStart[i]; literalsBytes += seq.litLength; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { literalsBytes += 0x10000; @@ -7104,14 +8588,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Returns match bytes represented in a seqStore */ -static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { -+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) +{ size_t matchBytes = 0; - size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); size_t i; for (i = 0; i < nbSeqs; ++i) { - seqDef seq = seqStore->sequencesStart[i]; +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef seq = seqStore->sequencesStart[i]; matchBytes += seq.mlBase + MINMATCH; if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { matchBytes += 0x10000; @@ -7121,15 +8606,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return matchBytes; } -@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchByt + /* 
Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. */ - static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, - const seqStore_t* originalSeqStore, +-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +- const seqStore_t* originalSeqStore, - size_t startIdx, size_t endIdx) { - BYTE* const litEnd = originalSeqStore->lit; - size_t literalsBytes; - size_t literalsBytesPreceding = 0; - ++static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, ++ const SeqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) +{ *resultSeqStore = *originalSeqStore; @@ -7140,7 +8628,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* Move longLengthPos into the correct position if necessary */ -@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seq +@@ -3328,13 +3928,12 @@ static void ZSTD_deriveSeqStoreChunk(seq } resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; @@ -7157,7 +8645,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } resultSeqStore->llCode += startIdx; resultSeqStore->mlCode += startIdx; -@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seq +@@ -3342,20 +3941,26 @@ static void ZSTD_deriveSeqStoreChunk(seq } /* @@ -7193,26 +8681,27 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* -@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 +@@ -3371,30 +3976,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 * 1-3 : repcode 1-3 * 4+ : real_offset+3 */ -static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, - seqStore_t* const seqStore, U32 const nbSeq) { +static void -+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, -+ const seqStore_t* const seqStore, U32 const nbSeq) ++ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, ++ const SeqStore_t* const seqStore, U32 const nbSeq) +{ U32 idx = 0; + U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; for (; idx < nbSeq; ++idx) { - seqDef* const seq = seqStore->sequencesStart + idx; +- seqDef* const seq = seqStore->sequencesStart + idx; - U32 const ll0 = (seq->litLength == 0); - U32 const offCode = OFFBASE_TO_STORED(seq->offBase); - assert(seq->offBase > 0); - if (STORED_IS_REPCODE(offCode)) { - U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); - U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ SeqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); + U32 const offBase = seq->offBase; + assert(offBase > 0); @@ -7238,21 +8727,40 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCode +@@ -3404,10 +4012,11 @@ static void ZSTD_seqStore_resolveOffCode * Returns the total size of that block (including header) or a ZSTD error code. 
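The counting helpers and chunk derivation above all juggle the same storage quirk: seqStore lengths live in 16-bit fields, so at most one sequence per block is flagged via longLengthPos as owing an extra 0x10000 bytes. A self-contained sketch of that accounting, with MiniSeq as a simplified stand-in for zstd's SeqDef:

#include <stddef.h>

typedef struct { unsigned short litLength; unsigned short mlBase; } MiniSeq;

/* Sum literal bytes across a block's sequences; longLenPos marks the one
 * sequence (if any) whose stored 16-bit length overflowed and needs the
 * +0x10000 correction. MiniSeq and the flag simplify SeqDef/longLengthType. */
static size_t count_literal_bytes(const MiniSeq *seqs, size_t nbSeqs,
                                  size_t longLenPos, int longLenIsLiteral)
{
    size_t total = 0;
    size_t i;
    for (i = 0; i < nbSeqs; i++) {
        total += seqs[i].litLength;
        if (longLenIsLiteral && i == longLenPos)
            total += 0x10000;
    }
    return total;
}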
*/ static size_t -ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, +- repcodes_t* const dRep, repcodes_t* const cRep, +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, -+ const seqStore_t* const seqStore, - repcodes_t* const dRep, repcodes_t* const cRep, ++ const SeqStore_t* const seqStore, ++ Repcodes_t* const dRep, Repcodes_t* const cRep, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, + const void* src, size_t srcSize, U32 lastBlock, U32 isPartition) { const U32 rleMaxLength = 25; -@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C +@@ -3417,7 +4026,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ +- repcodes_t const dRepOriginal = *dRep; ++ Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); +@@ -3428,7 +4037,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + +@@ -3442,8 +4051,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C cSeqsSize = 1; } @@ -7263,7 +8771,29 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } -@@ -3481,45 +4027,49 @@ typedef struct { +@@ -3451,18 +4061,18 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_C + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +@@ -3481,45 +4091,49 @@ typedef struct { /* Helper function to perform the recursive search for block splits. * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. 
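That cost comparison drives a plain bisection: when the two halves of a chunk are estimated to code smaller than the chunk as a whole, the midpoint is kept and each half is searched in turn. A minimal sketch of the recursion's shape; estimate() is a hypothetical stand-in for the entropy-based size estimate, and both limits are assumptions mirroring the guards visible in these hunks:

#include <stddef.h>

enum { MIN_SEQUENCES_TO_SPLIT = 4, MAX_SPLITS = 196 }; /* assumed limits */

static void derive_splits(size_t startIdx, size_t endIdx,
                          size_t (*estimate)(size_t start, size_t end),
                          unsigned *splits, size_t *nbSplits)
{
    size_t const mid = (startIdx + endIdx) / 2;
    if (endIdx - startIdx < MIN_SEQUENCES_TO_SPLIT || *nbSplits >= MAX_SPLITS)
        return;
    if (estimate(startIdx, mid) + estimate(mid, endIdx)
            < estimate(startIdx, endIdx)) {
        derive_splits(startIdx, mid, estimate, splits, nbSplits);
        splits[(*nbSplits)++] = (unsigned)mid; /* recorded in ascending order */
        derive_splits(mid, endIdx, estimate, splits, nbSplits);
    }
}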
@@ -7285,14 +8815,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ static void ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, - ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +- ZSTD_CCtx* zc, const seqStore_t* origSeqStore) ++ ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) { - seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; - seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; - seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; -+ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; -+ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; -+ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; size_t estimatedOriginalSize; size_t estimatedFirstHalfSize; size_t estimatedSecondHalfSize; @@ -7323,7 +8854,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); splits->splitLocations[splits->idx] = (U32)midIdx; splits->idx++; -@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSpl +@@ -3527,14 +4141,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSpl } } @@ -7347,7 +8878,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Refuse to try and split anything with less than 4 sequences */ return 0; } -@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZST +@@ -3550,18 +4168,20 @@ static size_t ZSTD_deriveBlockSplits(ZST * Returns combined size of all blocks (which includes headers), or a ZSTD error code. */ static size_t @@ -7368,15 +8899,26 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; - size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ -+ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; -+ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two -@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(Z - ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); - ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); +@@ -3577,36 +4197,37 @@ ZSTD_compressBlock_splitBlock_internal(Z + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
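For orientation while reading these hunks, the offBase convention quoted above packs both cases into one integer, so resolving it needs only the three-slot recent-offset history plus the litLength==0 flag. A sketch of what ZSTD_resolveRepcodeToRawOffset() is described as doing (not the verbatim implementation):

#define REP_NUM 3 /* zstd tracks the three most recent offsets */

/* offBase 1..3 names a recent-offset slot; offBase >= 4 means real_offset+3.
 * ll0 is 1 when the sequence carries no literals, which shifts the repcode
 * index by one; the shifted index REP_NUM then means "rep[0] - 1". */
static unsigned resolve_off_base(const unsigned rep[REP_NUM],
                                 unsigned offBase, unsigned ll0)
{
    if (offBase > REP_NUM)
        return offBase - REP_NUM;    /* an explicit match offset */
    {
        unsigned const idx = offBase - 1 + ll0;
        if (idx == REP_NUM)
            return rep[0] - 1;       /* special shifted slot */
        return rep[idx];
    }
}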
+ */ +- repcodes_t dRep; +- repcodes_t cRep; +- ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); ++ Repcodes_t dRep; ++ Repcodes_t cRep; ++ ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", @@ -7398,8 +8940,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); - assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); -+ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); -+ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); ++ assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); return cSizeSingleBlock; } @@ -7415,7 +8957,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> srcBytesTotal += srcBytes; if (lastPartition) { /* This is the final partition, need to account for possible last literals */ -@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(Z +@@ -3621,7 +4242,8 @@ ZSTD_compressBlock_splitBlock_internal(Z op, dstCapacity, ip, srcBytes, lastBlockEntireSrc, 1 /* isPartition */); @@ -7425,21 +8967,24 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); ip += srcBytes; -@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(Z +@@ -3629,12 +4251,12 @@ ZSTD_compressBlock_splitBlock_internal(Z dstCapacity -= cSizeChunk; cSize += cSizeChunk; *currSeqStore = *nextSeqStore; - assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); -+ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); } - /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes - * for the next block. + /* cRep and dRep may have diverged during the compression. + * If so, we use the dRep repcodes for the next block. 
*/ - ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); +- ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); return cSize; -@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* + } + +@@ -3643,21 +4265,20 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) { @@ -7447,8 +8992,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - BYTE* op = (BYTE*)dst; U32 nbSeq; size_t cSize; - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); -@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +- assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); ++ assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); if (bss == ZSTDbss_noCompress) { if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -7456,9 +9006,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); return cSize; -@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +@@ -3673,9 +4294,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) { @@ -7471,7 +9024,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ const U32 rleMaxLength = 25; size_t cSize; -@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z +@@ -3687,11 +4308,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); @@ -7489,7 +9042,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } -@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetC +@@ -3702,7 +4327,7 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* z + &zc->appliedParams, + dst, dstCapacity, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && +@@ -3767,10 +4392,11 @@ static size_t ZSTD_compressBlock_targetC * * cSize >= blockBound(srcSize): We have expanded the block too much so * emit an uncompressed block. 
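Both fallback paths above share one subtlety: a raw or RLE block never reaches the decoder's sequence stage, so the simulated decompression repcode history (dRep) must be rolled back to its pre-block value. A self-contained sketch of that decision with simplified types (block headers and the RLE case omitted; these are not the real zstd entry points):

#include <string.h>

typedef struct { unsigned rep[3]; } RepHistory;

/* If entropy coding saved nothing (cSeqsSize == 0), store the block raw and
 * restore the pre-block repcode history; otherwise keep the compressed body
 * and the advanced history. Returns bytes written, or (size_t)-1 on error. */
static size_t pick_block_encoding(unsigned char *dst, size_t dstCapacity,
                                  const unsigned char *src, size_t srcSize,
                                  size_t cSeqsSize,
                                  RepHistory *dRep, RepHistory dRepOriginal)
{
    if (cSeqsSize == 0) {
        if (srcSize > dstCapacity) return (size_t)-1;
        memcpy(dst, src, srcSize); /* raw block body */
        *dRep = dRepOriginal;      /* decoder never saw those sequences */
        return srcSize;
    }
    return cSeqsSize;              /* compressed body already in place */
}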
*/ @@ -7504,7 +9066,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); -@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetC +@@ -3778,7 +4404,7 @@ static size_t ZSTD_compressBlock_targetC } } } @@ -7513,7 +9075,55 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); /* Superblock compression failed, attempt to emit a single no compress block. -@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded +@@ -3807,7 +4433,7 @@ static size_t ZSTD_compressBlock_targetC + return cSize; + } + +-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ++static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, +@@ -3831,39 +4457,82 @@ static void ZSTD_overflowCorrectIfNeeded + } + } + ++#include "zstd_preSplit.h" ++ ++static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) ++{ ++ /* split level based on compression strategy, from `fast` to `btultra2` */ ++ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; ++ /* note: conservatively only split full blocks (128 KB) currently. ++ * While it's possible to go lower, let's keep it simple for a first implementation. ++ * Besides, benefits of splitting are reduced when blocks are already small. ++ */ ++ if (srcSize < 128 KB || blockSizeMax < 128 KB) ++ return MIN(srcSize, blockSizeMax); ++ /* do not split incompressible data though: ++ * require verified savings to allow pre-splitting. ++ * Note: as a consequence, the first full block is not split. ++ */ ++ if (savings < 3) { ++ DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); ++ return 128 KB; ++ } ++ /* apply @splitLevel, or use default value (which depends on @strat). ++ * note that splitting heuristic is still conditioned by @savings >= 3, ++ * so the first block will not reach this code path */ ++ if (splitLevel == 1) return 128 KB; ++ if (splitLevel == 0) { ++ assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); ++ splitLevel = splitLevels[strat]; ++ } else { ++ assert(2 <= splitLevel && splitLevel <= 6); ++ splitLevel -= 2; ++ } ++ return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); ++} ++ + /*! ZSTD_compress_frameChunk() : + * Compress a chunk of data into one or multiple blocks. * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
* Frame is supposed already started (header already produced) @@ -7522,27 +9132,88 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, -@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(Z - ZSTD_matchState_t* const ms = &cctx->blockState.matchState; - U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + const void* src, size_t srcSize, + U32 lastFrameChunk) + { +- size_t blockSize = cctx->blockSize; ++ size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; ++ S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + +- DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); ++ DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; +- U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); +- - RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; ++ size_t const blockSize = ZSTD_optimalBlockSize(cctx, ++ ip, remaining, ++ blockSizeMax, ++ cctx->appliedParams.preBlockSplitter_level, ++ cctx->appliedParams.cParams.strategy, ++ savings); ++ U32 const lastBlock = lastFrameChunk & (blockSize == remaining); ++ assert(blockSize <= remaining); ++ + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, dstSize_tooSmall, "not enough space to store compressed block"); - if (remaining < blockSize) blockSize = remaining; -@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(Z +- if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); +@@ -3899,8 +4568,23 @@ static size_t ZSTD_compress_frameChunk(Z MEM_writeLE24(op, cBlockHeader); cSize += ZSTD_blockHeaderSize; } - } + } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ ++ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. ++ * Without splitting, the maximum expansion is 3 bytes per full block. ++ * An adversarial input could attempt to fudge the split detector, ++ * and make it split incompressible data, resulting in more block headers. ++ * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, ++ * and the splitter never creates blocks that small (current lower limit is 8 KB), ++ * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. ++ * But if the goal is to not expand by more than 3-bytes per 128 KB full block, ++ * then yes, it becomes possible to make the block splitter oversplit incompressible data. ++ * Using @savings, we enforce an even more conservative condition, ++ * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, ++ * otherwise only full blocks are used. 
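In other words, the pre-splitter is gated twice: only full 128 KB blocks are candidates, and splitting is authorized only once at least 3 bytes of savings prove the stream compresses at all. A hypothetical condensation of ZSTD_optimalBlockSize()'s control flow, with split_block() standing in for ZSTD_splitBlock():

#include <stddef.h>

typedef long long S64;

#define FULL_BLOCK ((size_t)128 * 1024)

static size_t optimal_block_size(size_t srcSize, size_t blockSizeMax,
                                 S64 savings,
                                 size_t (*split_block)(size_t maxSize))
{
    if (srcSize < FULL_BLOCK || blockSizeMax < FULL_BLOCK)
        return srcSize < blockSizeMax ? srcSize : blockSizeMax; /* too small */
    if (savings < 3)
        return FULL_BLOCK;          /* no proven compression yet: stay whole */
    return split_block(FULL_BLOCK); /* heuristic boundary, <= one full block */
}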
++ * But being conservative is fine, ++ * since splitting barely compressible blocks is not fruitful anyway */ ++ savings += (S64)blockSize - (S64)cSize; ip += blockSize; -@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + assert(remaining >= blockSize); +@@ -3919,8 +4603,10 @@ static size_t ZSTD_compress_frameChunk(Z + + + static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, +- const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +-{ BYTE* const op = (BYTE*)dst; ++ const ZSTD_CCtx_params* params, ++ U64 pledgedSrcSize, U32 dictID) ++{ ++ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; +@@ -4001,19 +4687,15 @@ size_t ZSTD_writeLastEmptyBlock(void* ds } } @@ -7565,7 +9236,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_inte +@@ -4022,7 +4704,7 @@ static size_t ZSTD_compressContinue_inte + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) + { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", +@@ -4057,7 +4739,7 @@ static size_t ZSTD_compressContinue_inte + src, (BYTE const*)src + srcSize); + } + +- DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); ++ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? + ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); +@@ -4078,58 +4760,90 @@ static size_t ZSTD_compressContinue_inte } } @@ -7624,13 +9313,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! ZSTD_loadDictionaryContent() : * @return : 0, or an error code */ -@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent - ZSTD_cwksp* ws, - ZSTD_CCtx_params const* params, - const void* src, size_t srcSize, +-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +- ldmState_t* ls, +- ZSTD_cwksp* ws, +- ZSTD_CCtx_params const* params, +- const void* src, size_t srcSize, - ZSTD_dictTableLoadMethod_e dtlm) -+ ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp) ++static size_t ++ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ++ ldmState_t* ls, ++ ZSTD_cwksp* ws, ++ ZSTD_CCtx_params const* params, ++ const void* src, size_t srcSize, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; @@ -7669,7 +9365,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* If the dictionary is too large, only load the suffix of the dictionary. 
*/ if (srcSize > maxDictSize) { ip = iend - maxDictSize; -@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent +@@ -4138,35 +4852,59 @@ static size_t ZSTD_loadDictionaryContent } } @@ -7684,24 +9380,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - ms->forceNonContiguous = params->deterministicRefPrefix; - if (loadLdmDict) { -+ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); + + if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); -+ } -+ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); + } + + /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ -+ if (params->cParams.strategy < ZSTD_btultra) { -+ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } - } - ++ } ++ + ms->nextToUpdate = (U32)(ip - ms->window.base); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; @@ -7737,7 +9434,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(srcSize >= HASH_READ_SIZE); if (ms->dedicatedDictSearch) { assert(ms->chainTable != NULL); -@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent +@@ -4174,7 +4912,7 @@ static size_t ZSTD_loadDictionaryContent } else { assert(params->useRowMatchFinder != ZSTD_ps_auto); if (params->useRowMatchFinder == ZSTD_ps_enable) { @@ -7746,7 +9443,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_memset(ms->tagTable, 0, tagTableSize); ZSTD_row_update(ms, iend-HASH_READ_SIZE); DEBUGLOG(4, "Using row-based hash table for lazy dict"); -@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent +@@ -4183,14 +4921,24 @@ static size_t ZSTD_loadDictionaryContent DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } } @@ -7763,6 +9460,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) assert(srcSize >= HASH_READ_SIZE); ++ DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ @@ -7770,7 +9468,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> break; default: -@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed +@@ -4233,20 +4981,19 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, +- dictEnd-dictPtr, &hasZeroWeights); ++ (size_t)(dictEnd-dictPtr), &hasZeroWeights); /* We only set the loaded table as valid if it contains all non-zero * weights. 
Otherwise, we set it to check */ @@ -7783,7 +9486,46 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> dictPtr += hufHeaderSize; } -@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZS + { unsigned offcodeLog; +- size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); ++ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ +@@ -4261,7 +5008,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; +- size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4275,7 +5022,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; +- size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4309,7 +5056,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + +- return dictPtr - (const BYTE*)dict; ++ return (size_t)(dictPtr - (const BYTE*)dict); + } + + /* Dictionary format : +@@ -4322,11 +5069,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + * dictSize supposed >= 8 + */ + static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, const void* dict, size_t dictSize, ZSTD_dictTableLoadMethod_e dtlm, @@ -7791,7 +9533,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> void* workspace) { const BYTE* dictPtr = (const BYTE*)dict; -@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZS +@@ -4345,7 +5093,7 @@ static size_t ZSTD_loadZstdDictionary(ZS { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( @@ -7800,7 +9542,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } return dictID; } -@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_comp +@@ -4354,13 +5102,14 @@ static size_t ZSTD_loadZstdDictionary(ZS + * @return : dictID, or an error code */ + static size_t + ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, 
ZSTD_dictTableLoadMethod_e dtlm, @@ -7808,7 +9558,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); -@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_comp +@@ -4373,13 +5122,13 @@ ZSTD_compress_insertDictionary(ZSTD_comp /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) @@ -7824,7 +9574,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ -@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_comp +@@ -4387,13 +5136,14 @@ ZSTD_compress_insertDictionary(ZSTD_comp /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( @@ -7840,21 +9590,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, -@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_interna +@@ -4426,11 +5176,11 @@ static size_t ZSTD_compressBegin_interna cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, cdict->dictContentSize, cdict->dictContentType, dtlm, - cctx->entropyWorkspace) -+ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->tmpWorkspace) : ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, - dictContentType, dtlm, cctx->entropyWorkspace); -+ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; -@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_ +@@ -4471,11 +5221,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_ &cctxParams, pledgedSrcSize); } @@ -7869,7 +9619,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); -@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD +@@ -4483,9 +5233,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } @@ -7886,7 +9636,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC +@@ -4496,14 +5252,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC { BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; @@ -7902,7 +9652,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); dstCapacity -= fhSize; op += fhSize; -@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC +@@ -4513,8 +5268,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC if (cctx->stage != ZSTDcs_ending) { /* write one last empty block, make it the "last" block */ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; @@ -7914,7 +9664,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; } -@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, si +@@ -4528,7 +5284,7 @@ static size_t ZSTD_writeEpilogue(ZSTD_CC + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ +- return op-ostart; ++ return (size_t)(op-ostart); + } + + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +@@ -4537,9 +5293,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, si (void)extraCSize; } @@ -7927,7 +9686,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { size_t endResult; size_t const cSize = ZSTD_compressContinue_internal(cctx, -@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx +@@ -4563,6 +5319,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx return cSize + endResult; } @@ -7942,7 +9701,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, -@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( +@@ -4591,7 +5355,7 @@ size_t ZSTD_compress_advanced_internal( FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, params, srcSize, ZSTDb_not_buffered) , ""); @@ -7951,7 +9710,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, -@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( +@@ -4709,7 +5473,7 @@ static size_t ZSTD_initCDict_internal( { size_t const dictID = ZSTD_compress_insertDictionary( &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, @@ -7960,7 +9719,56 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; -@@ -4813,7 +5452,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( +@@ -4719,14 +5483,16 @@ static size_t ZSTD_initCDict_internal( + return 0; + } + +-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, +- ZSTD_compressionParameters cParams, +- ZSTD_paramSwitch_e useRowMatchFinder, +- U32 enableDedicatedDictSearch, +- ZSTD_customMem customMem) ++static ZSTD_CDict* ++ZSTD_createCDict_advanced_internal(size_t dictSize, ++ 
ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_compressionParameters cParams, ++ ZSTD_ParamSwitch_e useRowMatchFinder, ++ int enableDedicatedDictSearch, ++ ZSTD_customMem customMem) + { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; ++ DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + +@@ -4763,6 +5529,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(co + { + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; +@@ -4783,7 +5550,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { +@@ -4802,7 +5569,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + +@@ -4813,7 +5580,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( if (!cdict) return NULL; @@ -7969,7 +9777,41 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> dict, dictSize, dictLoadMethod, dictContentType, cctxParams) )) { -@@ -4908,6 +5547,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( +@@ -4867,7 +5634,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level +- * into its relevants cParams. ++ * into its relevant cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. 
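The workspace contract documented above is easy to get wrong from the call site, so here is a usage sketch against upstream's static-allocation API (these are ZSTD_STATIC_LINKING_ONLY declarations in zstd.h; the in-kernel wrappers in linux/zstd.h differ, so treat this as upstream-flavoured rather than kernel code):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>
#include <stdlib.h>

/* Build a CDict inside a caller-owned buffer. There is no ZSTD_freeCDict()
 * for the result: freeing wksp releases everything. malloc() satisfies the
 * 8-byte alignment the initializer checks for. */
static const ZSTD_CDict *make_static_cdict(const void *dict, size_t dictSize,
                                           int level)
{
    ZSTD_compressionParameters const cParams =
        ZSTD_getCParams(level, 0 /* unknown srcSize */, dictSize);
    size_t const wkspSize =
        ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
    void *const wksp = malloc(wkspSize);
    const ZSTD_CDict *cdict;
    if (wksp == NULL) return NULL;
    cdict = ZSTD_initStaticCDict(wksp, wkspSize, dict, dictSize,
                                 ZSTD_dlm_byCopy, ZSTD_dct_auto, cParams);
    if (cdict == NULL) free(wksp); /* workspace too small or misaligned */
    return cdict;
}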
+@@ -4879,7 +5646,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) + { +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +@@ -4890,6 +5657,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + ++ DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { +@@ -4900,14 +5668,13 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + +- DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", +- (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); params.cParams = cParams; params.useRowMatchFinder = useRowMatchFinder; cdict->useRowMatchFinder = useRowMatchFinder; @@ -7977,7 +9819,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, -@@ -4987,12 +5627,17 @@ size_t ZSTD_compressBegin_usingCDict_adv +@@ -4987,12 +5754,17 @@ size_t ZSTD_compressBegin_usingCDict_adv /* ZSTD_compressBegin_usingCDict() : * cdict must be != NULL */ @@ -7996,7 +9838,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*! ZSTD_compress_usingCDict_internal(): * Implementation of various ZSTD_compress_usingCDict* functions. */ -@@ -5002,7 +5647,7 @@ static size_t ZSTD_compress_usingCDict_i +@@ -5002,7 +5774,7 @@ static size_t ZSTD_compress_usingCDict_i const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ @@ -8005,7 +9847,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /*! 
ZSTD_compress_usingCDict_advanced(): -@@ -5199,30 +5844,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zc +@@ -5068,7 +5840,7 @@ size_t ZSTD_CStreamOutSize(void) + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; + } + +-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) ++static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) + { + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; +@@ -5199,30 +5971,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zc static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { @@ -8013,11 +9864,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - if (hintInSize==0) hintInSize = cctx->blockSize; - return hintInSize; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { -+ return cctx->blockSize - cctx->stableIn_notConsumed; ++ return cctx->blockSizeMax - cctx->stableIn_notConsumed; + } + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); + { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; -+ if (hintInSize==0) hintInSize = cctx->blockSize; ++ if (hintInSize==0) hintInSize = cctx->blockSizeMax; + return hintInSize; + } } @@ -8059,7 +9910,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { assert(zcs->inBuff != NULL); assert(zcs->inBuffSize > 0); -@@ -5231,8 +5887,10 @@ static size_t ZSTD_compressStream_generi +@@ -5231,8 +6014,10 @@ static size_t ZSTD_compressStream_generi assert(zcs->outBuff != NULL); assert(zcs->outBuffSize > 0); } @@ -8071,18 +9922,29 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { -@@ -5247,7 +5905,7 @@ static size_t ZSTD_compressStream_generi +@@ -5243,12 +6028,13 @@ static size_t ZSTD_compressStream_generi + + case zcss_load: + if ( (flushMode == ZSTD_e_end) +- && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ ++ && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ && (zcs->inBuffPos == 0) ) { /* shortcut to compression pass directly into output buffer */ - size_t const cSize = ZSTD_compressEnd(zcs, +- op, oend-op, ip, iend-ip); + size_t const cSize = ZSTD_compressEnd_public(zcs, - op, oend-op, ip, iend-ip); ++ op, (size_t)(oend-op), ++ ip, (size_t)(iend-ip)); DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); -@@ -5264,8 +5922,7 @@ static size_t ZSTD_compressStream_generi + ip = iend; +@@ -5262,10 +6048,9 @@ static size_t ZSTD_compressStream_generi + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( zcs->inBuff + zcs->inBuffPos, toLoad, - ip, iend-ip); +- ip, iend-ip); ++ ip, (size_t)(iend-ip)); zcs->inBuffPos += loaded; - if (loaded != 0) - ip += loaded; @@ -8090,14 +9952,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ -@@ -5276,6 +5933,20 @@ static size_t ZSTD_compressStream_generi +@@ -5276,16 +6061,29 @@ static size_t ZSTD_compressStream_generi /* empty */ someMoreWork = 0; break; 
} + } else { + assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + if ( (flushMode == ZSTD_e_continue) -+ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { + /* can't compress a full block : stop here */ + zcs->stableIn_notConsumed = (size_t)(iend - ip); + ip = iend; /* pretend to have consumed input */ @@ -8111,19 +9973,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); -@@ -5283,9 +5954,8 @@ static size_t ZSTD_compressStream_generi + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); void* cDst; size_t cSize; - size_t oSize = oend-op; +- size_t oSize = oend-op; - size_t const iSize = inputBuffered - ? zcs->inBuffPos - zcs->inToCompress - : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t oSize = (size_t)(oend-op); + size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress -+ : MIN((size_t)(iend - ip), zcs->blockSize); ++ : MIN((size_t)(iend - ip), zcs->blockSizeMax); if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else -@@ -5293,9 +5963,9 @@ static size_t ZSTD_compressStream_generi +@@ -5293,34 +6091,31 @@ static size_t ZSTD_compressStream_generi if (inputBuffered) { unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); cSize = lastBlock ? @@ -8135,7 +9998,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zcs->inBuff + zcs->inToCompress, iSize); FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); zcs->frameEnded = lastBlock; -@@ -5308,19 +5978,16 @@ static size_t ZSTD_compressStream_generi + /* prepare next block */ +- zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; ++ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) +- zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; ++ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); if (!lastBlock) assert(zcs->inBuffTarget <= zcs->inBuffSize); zcs->inToCompress = zcs->inBuffPos; @@ -8161,7 +10031,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } if (cDst == op) { /* no need to flush */ op += cSize; -@@ -5390,8 +6057,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* +@@ -5369,8 +6164,8 @@ static size_t ZSTD_compressStream_generi + } + } + +- input->pos = ip - istart; +- output->pos = op - ostart; ++ input->pos = (size_t)(ip - istart); ++ output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); + } +@@ -5390,8 +6185,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* /* After a compression call set the expected input/output buffer. * This is validated at the start of the next compression call. 
*/ @@ -8173,7 +10054,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { cctx->expectedInBuffer = *input; } -@@ -5410,22 +6079,22 @@ static size_t ZSTD_checkBufferStability( +@@ -5410,22 +6207,27 @@ static size_t ZSTD_checkBufferStability( { if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ZSTD_inBuffer const expect = cctx->expectedInBuffer; @@ -8194,6 +10075,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return 0; } ++/* ++ * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. ++ * Otherwise, it's ignored. ++ * @return: 0 on success, or a ZSTD_error code otherwise. ++ */ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, ZSTD_EndDirective endOp, - size_t inSize) { @@ -8202,21 +10088,29 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_CCtx_params params = cctx->requestedParams; ZSTD_prefixDict const prefixDict = cctx->prefixDict; FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ -@@ -5439,9 +6108,9 @@ static size_t ZSTD_CCtx_init_compressStr +@@ -5438,21 +6240,24 @@ static size_t ZSTD_CCtx_init_compressStr + */ params.compressionLevel = cctx->cdict->compressionLevel; } - DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); - if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ - { - size_t const dictSize = prefixDict.dict ++ DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ + + { size_t const dictSize = prefixDict.dict ? prefixDict.dictSize : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); - ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); -@@ -5453,6 +6122,9 @@ static size_t ZSTD_CCtx_init_compressStr - params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); +- ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); ++ ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + +- params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); ++ params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); + params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); @@ -8225,7 +10119,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); -@@ -5479,6 +6151,8 @@ static size_t ZSTD_CCtx_init_compressStr +@@ -5468,7 +6273,7 @@ static size_t ZSTD_CCtx_init_compressStr + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ +- cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); ++ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } +@@ -5479,6 +6284,8 @@ static size_t ZSTD_CCtx_init_compressStr return 0; } @@ -8234,7 +10137,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input, -@@ -5493,8 +6167,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* +@@ -5493,8 +6300,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* /* transparent initialization stage */ if (cctx->streamStage == zcss_init) { @@ -8264,7 +10167,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* end of transparent initialization stage */ -@@ -5512,13 +6205,20 @@ size_t ZSTD_compressStream2_simpleArgs ( +@@ -5512,13 +6338,20 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { @@ -8291,7 +10194,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } size_t ZSTD_compress2(ZSTD_CCtx* cctx, -@@ -5541,6 +6241,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6374,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, /* Reset to the original values. 
*/ cctx->requestedParams.inBufferMode = originalInBufferMode; cctx->requestedParams.outBufferMode = originalOutBufferMode; @@ -8299,7 +10202,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); -@@ -5551,64 +6252,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5551,64 +6385,67 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, } } @@ -8310,13 +10213,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -} ZSTD_sequencePosition; - /* ZSTD_validateSequence() : - * @offCode : is presumed to follow format required by ZSTD_storeSeq() +- * @offCode : is presumed to follow format required by ZSTD_storeSeq() ++ * @offBase : must use the format required by ZSTD_storeSeq() * @returns a ZSTD error code if sequence is not valid */ static size_t -ZSTD_validateSequence(U32 offCode, U32 matchLength, - size_t posInSrc, U32 windowLog, size_t dictSize) -+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, + size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) { - U32 const windowSize = 1 << windowLog; @@ -8330,7 +10234,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); - RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); + size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4; -+ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); + /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ + RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); return 0; @@ -8362,33 +10266,43 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of - * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. -- */ --static size_t -+size_t - ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, - ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++/* This function scans through an array of ZSTD_Sequence, ++ * storing the sequences it reads, until it reaches a block delimiter. ++ * Note that the block delimiter includes the last literals of the block. ++ * @blockSize must be == sum(sequence_lengths). ++ * @returns @blockSize on success, and a ZSTD_error otherwise. 
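The delimiter convention these transfer functions rely on is worth spelling out: with explicit block delimiters, a block's sequence run ends at an entry whose offset and matchLength are both 0, and that entry's litLength holds the block's trailing literals. A small hypothetical validator of the shape checked above (Seq is a stand-in for ZSTD_Sequence):

#include <stddef.h>

typedef struct { unsigned offset, litLength, matchLength, rep; } Seq;

/* Walk one block's sequences; return the block size they imply, or
 * (size_t)-1 when no delimiter exists, mirroring the
 * "Block delimiter not found" error raised above. */
static size_t scan_block(const Seq *seqs, size_t nbSeqs)
{
    size_t total = 0;
    size_t i;
    for (i = 0; i < nbSeqs; i++) {
        total += seqs[i].litLength + seqs[i].matchLength;
        if (seqs[i].offset == 0 && seqs[i].matchLength == 0)
            return total; /* delimiter: its litLength = last literals */
    }
    return (size_t)-1;
}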
+ */ + static size_t +-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +- ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) -+ const void* src, size_t blockSize, -+ ZSTD_paramSwitch_e externalRepSearch) ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) { U32 idx = seqPos->idx; + U32 const startIdx = idx; BYTE const* ip = (BYTE const*)(src); const BYTE* const iend = ip + blockSize; - repcodes_t updatedRepcodes; +- repcodes_t updatedRepcodes; ++ Repcodes_t updatedRepcodes; U32 dictSize; -+ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); + if (cctx->cdict) { dictSize = (U32)cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5617,25 +6315,55 @@ ZSTD_copySequencesToSeqStoreExplicitBloc +@@ -5616,27 +6453,60 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + } else { dictSize = 0; } - ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); - for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { U32 const litLength = inSeqs[idx].litLength; - U32 const ll0 = (litLength == 0); @@ -8411,8 +10325,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> seqPos->posInSrc += litLength + matchLength; - FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, - cctx->appliedParams.cParams.windowLog, dictSize), -+ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, -+ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, ++ seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ++ ZSTD_hasExtSeqProd(&cctx->appliedParams)), "Sequence validation failed"); } - RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, @@ -8422,6 +10338,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); ip += matchLength + litLength; } +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + assert(externalRepSearch != ZSTD_ps_auto); @@ -8446,40 +10364,61 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } + } + - ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); if (inSeqs[idx].litLength) { -@@ -5644,26 +6372,15 @@ ZSTD_copySequencesToSeqStoreExplicitBloc + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); +@@ -5644,37 +6514,43 @@ 
ZSTD_copySequencesToSeqStoreExplicitBloc ip += inSeqs[idx].litLength; seqPos->posInSrc += inSeqs[idx].litLength; } - RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); + RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); seqPos->idx = idx+1; - return 0; +- return 0; ++ return blockSize; } -/* Returns the number of bytes to move the current read position back by. Only non-zero - * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something - * went wrong. -- * ++/* ++ * This function attempts to scan through @blockSize bytes in @src ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. + * - * This function will attempt to scan through blockSize bytes represented by the sequences - * in inSeqs, storing any (partial) sequences. -- * ++ * Occasionally, we may want to reduce the actual number of bytes consumed from @src ++ * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. + * - * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to - * avoid splitting a match, or to avoid splitting a match such that it would produce a match - * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. -- */ --static size_t -+size_t - ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ * @returns the number of bytes consumed from @src, necessarily <= @blockSize. ++ * Otherwise, it may return a ZSTD error if something went wrong. + */ + static size_t +-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, - const void* src, size_t blockSize) -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) ++ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) { U32 idx = seqPos->idx; U32 startPosInSequence = seqPos->posInSequence; -@@ -5675,6 +6392,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; +- BYTE const* ip = (BYTE const*)(src); +- BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ +- repcodes_t updatedRepcodes; ++ const BYTE* const istart = (const BYTE*)(src); ++ const BYTE* ip = istart; ++ const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ ++ Repcodes_t updatedRepcodes; U32 bytesAdjustment = 0; U32 finalMatchSplit = 0; @@ -8489,16 +10428,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (cctx->cdict) { dictSize = cctx->cdict->dictContentSize; } else if (cctx->prefixDict.dict) { -@@ -5682,7 +6402,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim +@@ -5682,15 +6558,15 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim } else { dictSize = 0; } - DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); -+ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, 
startPosInSequence, blockSize); DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); - ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { -@@ -5690,7 +6410,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + const ZSTD_Sequence currSeq = inSeqs[idx]; U32 litLength = currSeq.litLength; U32 matchLength = currSeq.matchLength; U32 const rawOffset = currSeq.offset; @@ -8507,7 +10447,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { -@@ -5704,7 +6424,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim +@@ -5704,7 +6580,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim /* Move to the next sequence */ endPosInSequence -= currSeq.litLength + currSeq.matchLength; startPosInSequence = 0; @@ -8515,7 +10455,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } else { /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence does not reach the end of the match. So, we have to split the sequence */ -@@ -5744,21 +6463,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim +@@ -5744,58 +6619,113 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim } /* Check if this offset can be represented with a repcode */ { U32 const ll0 = (litLength == 0); @@ -8546,25 +10486,65 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); -@@ -5781,7 +6502,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); - typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -- const void* src, size_t blockSize); -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); - static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) - { - ZSTD_sequenceCopier sequenceCopier = NULL; -@@ -5795,6 +6516,57 @@ static ZSTD_sequenceCopier ZSTD_selectSe - return sequenceCopier; + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ +- U32 lastLLSize = (U32)(iend - ip); ++ U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + +- return bytesAdjustment; ++ return (size_t)(iend-istart); } +-typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); +-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) ++/* @seqPos represents a position within @inSeqs, ++ * it is read and updated by this 
function, ++ * once the goal to produce a block of size @blockSize is reached. ++ * @return: nb of bytes consumed from @src, necessarily <= @blockSize. ++ */ ++typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) + { +- ZSTD_sequenceCopier sequenceCopier = NULL; +- assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; +- } else if (mode == ZSTD_sf_noBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreNoBlockDelim; ++ return ZSTD_transferSequences_wBlockDelim; + } +- assert(sequenceCopier != NULL); +- return sequenceCopier; ++ assert(mode == ZSTD_sf_noBlockDelimiters); ++ return ZSTD_transferSequences_noDelim; + } + +-/* Compress, block-by-block, all of the sequences given. +/* Discover the size of next block by searching for the delimiter. + * Note that a block delimiter **must** exist in this mode, + * otherwise it's an input error. + * The block size retrieved will be later compared to ensure it remains within bounds */ +static size_t -+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) +{ + int end = 0; + size_t blockSize = 0; @@ -8586,20 +10566,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + return blockSize; +} + -+/* More a "target" block size */ -+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) -+{ -+ int const lastBlock = (remaining <= blockSize); -+ return lastBlock ? remaining : blockSize; -+} -+ -+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, + size_t blockSize, size_t remaining, -+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ ZSTD_SequencePosition seqPos) +{ + DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); -+ if (mode == ZSTD_sf_noBlockDelimiters) -+ return blockSize_noDelimiter(blockSize, remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) { ++ /* Note: more a "target" block size */ ++ return MIN(remaining, blockSize); ++ } ++ assert(mode == ZSTD_sf_explicitBlockDelimiters); + { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); + FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); + if (explicitBlockSize > blockSize) @@ -8610,10 +10587,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } +} + - /* Compress, block-by-block, all of the sequences given. ++/* Compress all provided sequences, block-by-block. * * Returns the cumulative size of all compressed blocks (including their headers), -@@ -5807,9 +6579,6 @@ ZSTD_compressSequences_internal(ZSTD_CCt + * otherwise a ZSTD error. 
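 *
 * Editor's sketch (not part of the patch): a minimal caller loop for the
 * ZSTD_SequenceCopier_f contract introduced above, which now returns the
 * number of bytes consumed from @src (<= blockSize) instead of the old
 * "bytes to rewind" adjustment. `drive_copier` is a hypothetical name;
 * @seqPos carries progress through @seqs across calls, as documented above.
 *
 *   static size_t drive_copier(ZSTD_CCtx* cctx, ZSTD_SequenceCopier_f copier,
 *                              ZSTD_SequencePosition* seqPos,
 *                              const ZSTD_Sequence* seqs, size_t nbSeqs,
 *                              const void* src, size_t srcSize,
 *                              size_t targetBlockSize)
 *   {
 *       size_t consumed = 0;
 *       while (consumed < srcSize) {
 *           size_t const want = MIN(targetBlockSize, srcSize - consumed);
 *           size_t const got = copier(cctx, seqPos, seqs, nbSeqs,
 *                                     (const char*)src + consumed, want,
 *                                     ZSTD_ps_disable /* must not be ZSTD_ps_auto */);
 *           FORWARD_IF_ERROR(got, "sequence transfer failed");
 *           assert(got <= want);  /* a copier never over-consumes */
 *           consumed += got;
 *       }
 *       return consumed;
 *   }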
+@@ -5807,15 +6737,12 @@ ZSTD_compressSequences_internal(ZSTD_CCt const void* src, size_t srcSize) { size_t cSize = 0; @@ -8621,31 +10599,43 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - size_t blockSize; - size_t compressedSeqsSize; size_t remaining = srcSize; - ZSTD_sequencePosition seqPos = {0, 0, 0}; +- ZSTD_sequencePosition seqPos = {0, 0, 0}; ++ ZSTD_SequencePosition seqPos = {0, 0, 0}; -@@ -5829,22 +6598,29 @@ ZSTD_compressSequences_internal(ZSTD_CCt +- BYTE const* ip = (BYTE const*)src; ++ const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; +- ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); ++ ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ +@@ -5829,22 +6756,29 @@ ZSTD_compressSequences_internal(ZSTD_CCt } while (remaining) { + size_t compressedSeqsSize; size_t cBlockSize; - size_t additionalByteAdjustment; +- size_t additionalByteAdjustment; - lastBlock = remaining <= cctx->blockSize; - blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; + size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, -+ cctx->blockSize, remaining, ++ cctx->blockSizeMax, remaining, + inSeqs, inSeqsSize, seqPos); + U32 const lastBlock = (blockSize == remaining); + FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); + assert(blockSize <= remaining); ZSTD_resetSeqStore(&cctx->seqStore); - DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); -+ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); - additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); -+ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); - FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); - blockSize -= additionalByteAdjustment; +- FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); +- blockSize -= additionalByteAdjustment; ++ blockSize = sequenceCopier(cctx, ++ &seqPos, inSeqs, inSeqsSize, ++ ip, blockSize, ++ cctx->appliedParams.searchForExternalRepcodes); ++ FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); /* If blocks are too small, emit as a nocompress block */ - if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { @@ -8655,11 +10645,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); - DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); -+ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); cSize += cBlockSize; ip += blockSize; op += cBlockSize; -@@ -5853,6 +6629,7 @@ ZSTD_compressSequences_internal(ZSTD_CCt +@@ -5853,35 +6787,36 @@ ZSTD_compressSequences_internal(ZSTD_CCt continue; } @@ -8667,8 +10657,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, -@@ -5861,11 +6638,11 @@ 
ZSTD_compressSequences_internal(ZSTD_CCt - cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, +- cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, cctx->bmi2); FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); - DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); @@ -8677,11 +10669,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (!cctx->isFirstBlock && ZSTD_maybeRLE(&cctx->seqStore) && - ZSTD_isRLE((BYTE const*)src, srcSize)) { +- /* We don't want to emit our first block as a RLE even if it qualifies because +- * doing so will cause the decoder (cli only) to throw a "should consume all input error." +- * This is only an issue for zstd <= v1.4.3 +- */ + ZSTD_isRLE(ip, blockSize)) { - /* We don't want to emit our first block as a RLE even if it qualifies because - * doing so will cause the decoder (cli only) to throw a "should consume all input error." - * This is only an issue for zstd <= v1.4.3 -@@ -5876,12 +6653,12 @@ ZSTD_compressSequences_internal(ZSTD_CCt ++ /* Note: don't emit the first block as RLE even if it qualifies because ++ * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error ++ * "should consume all input error." ++ */ + compressedSeqsSize = 1; + } + if (compressedSeqsSize == 0) { /* ZSTD_noCompressBlock writes the block header as well */ cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); @@ -8698,7 +10697,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } else { U32 cBlockHeader; /* Error checking and repcodes update */ -@@ -5893,11 +6670,10 @@ ZSTD_compressSequences_internal(ZSTD_CCt +@@ -5893,11 +6828,10 @@ ZSTD_compressSequences_internal(ZSTD_CCt cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); MEM_writeLE24(op, cBlockHeader); cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; @@ -8711,7 +10710,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (lastBlock) { break; -@@ -5908,12 +6684,15 @@ ZSTD_compressSequences_internal(ZSTD_CCt +@@ -5908,41 +6842,50 @@ ZSTD_compressSequences_internal(ZSTD_CCt dstCapacity -= cBlockSize; cctx->isFirstBlock = 0; } @@ -8728,20 +10727,582 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const ZSTD_Sequence* inSeqs, size_t inSeqsSize, const void* src, size_t srcSize) { -@@ -5923,7 +6702,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* - size_t frameHeaderSize = 0; + BYTE* op = (BYTE*)dst; + size_t cSize = 0; +- size_t compressedBlocksSize = 0; +- size_t frameHeaderSize = 0; /* Transparent initialization stage, same as compressStream2() */ - DEBUGLOG(3, "ZSTD_compressSequences()"); -+ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); ++ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); assert(cctx != NULL); FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); ++ /* Begin writing output, starting with frame header */ -@@ -5951,26 +6730,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* +- frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); +- op += frameHeaderSize; +- dstCapacity -= frameHeaderSize; +- 
cSize += frameHeaderSize; ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, srcSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + xxh64_update(&cctx->xxhState, src, srcSize); + } +- /* cSize includes block header size and compressed sequences size */ +- compressedBlocksSize = ZSTD_compressSequences_internal(cctx, ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); +- FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); +- cSize += compressedBlocksSize; +- dstCapacity -= compressedBlocksSize; ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } + ++ /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); +@@ -5951,26 +6894,557 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cSize += 4; } - DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); ++ return cSize; ++} ++ ++ ++#if defined(__AVX2__) ++ ++#include <immintrin.h> /* AVX2 intrinsics */ ++ ++/* ++ * Convert 2 sequences per iteration, using AVX2 intrinsics: ++ * - offset -> offBase = offset + 2 ++ * - litLength -> (U16) litLength ++ * - matchLength -> (U16)(matchLength - 3) ++ * - rep is ignored ++ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). ++ * ++ * At the end, instead of extracting two __m128i, ++ * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, ++ * then store the lower 16 bytes in one go. ++ * ++ * @returns 0 on succes, with no long length detected ++ * @returns > 0 if there is one long length (> 65535), ++ * indicating the position, and type. ++ */ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ /* ++ * addition: ++ * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) ++ */ ++ const __m256i addition = _mm256_setr_epi32( ++ ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ++ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ++ ); ++ ++ /* limit: check if there is a long length */ ++ const __m256i limit = _mm256_set1_epi32(65535); ++ ++ /* ++ * shuffle mask for byte-level rearrangement in each 128-bit half: ++ * ++ * Input layout (after addition) per 128-bit half: ++ * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] ++ * We only need: ++ * offBase (4 bytes) = offset+2 ++ * litLength (2 bytes) = low 2 bytes of litLength ++ * mlBase (2 bytes) = low 2 bytes of (matchLength) ++ * => Bytes [0..3, 4..5, 8..9], zero the rest. 
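++ *
++ * Editor's worked example (values chosen for illustration, not part of the
++ * patch): for an input sequence { offset=100, litLength=7, matchLength=18,
++ * rep=0 }, the addition above yields { 100+ZSTD_REP_NUM, 7, 18-MINMATCH, 0 }
++ * = { 103, 7, 15, 0 } (ZSTD_REP_NUM==3, MINMATCH==3), and the mask defined
++ * just below then packs bytes [0..3], [4..5] and [8..9] of that result into
++ * the 8-byte SeqDef { offBase=103, litLength=7, mlBase=15 }.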
++ */ ++ const __m256i mask = _mm256_setr_epi8( ++ /* For the lower 128 bits => sequence i */ ++ 0, 1, 2, 3, /* offset+2 */ ++ 4, 5, /* litLength (16 bits) */ ++ 8, 9, /* matchLength (16 bits) */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ ++ /* For the upper 128 bits => sequence i+1 */ ++ 16,17,18,19, /* offset+2 */ ++ 20,21, /* litLength */ ++ 24,25, /* matchLength */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ++ ); ++ ++ /* ++ * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). ++ * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. ++ * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. ++ */ ++#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ ++ ++ size_t longLen = 0, i = 0; ++ ++ /* AVX permutation depends on the specific definition of target structures */ ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); ++ ++ /* Process 2 sequences per loop iteration */ ++ for (; i + 1 < nbSequences; i += 2) { ++ /* Load 2 ZSTD_Sequence (32 bytes) */ ++ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); ++ ++ /* Add {2, 0, -3, 0} in each 128-bit half */ ++ __m256i vadd = _mm256_add_epi32(vin, addition); ++ ++ /* Check for long length */ ++ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ ++ int ll_res = _mm256_movemask_epi8(ll_cmp); ++ ++ /* Shuffle bytes so each half gives us the 8 bytes we need */ ++ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); ++ /* ++ * Now: ++ * Lane0 = seq0's 8 bytes ++ * Lane1 = 0 ++ * Lane2 = seq1's 8 bytes ++ * Lane3 = 0 ++ */ ++ ++ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ ++ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); ++ /* ++ * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. ++ * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. 
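++ *
++ * Editor's note (illustration only): 0xE8 is 11'10'10'00 in 2-bit lane
++ * selectors, i.e. [0,2,2,3]; with input lanes {A,B,C,D} the permute
++ * produces {A,C,C,D}, so the 16-byte store below emits A (sequence i)
++ * followed by C (sequence i+1), and the duplicated upper half is ignored.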
++ */ ++ ++ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ ++ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); ++ /* ++ * This writes out 16 bytes total: ++ * - offset 0..7 => seq0 (offBase, litLength, mlBase) ++ * - offset 8..15 => seq1 (offBase, litLength, mlBase) ++ */ ++ ++ /* check (unlikely) long lengths > 65535 ++ * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] ++ * => combined mask = 0x0FF00FF0 ++ */ ++ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { ++ /* long length detected: let's figure out which one*/ ++ if (inSeqs[i].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (inSeqs[i].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ if (inSeqs[i+1].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1 + 1; ++ } ++ if (inSeqs[i+1].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + 1 + nbSequences + 1; ++ } ++ } ++ } ++ ++ /* Handle leftover if @nbSequences is odd */ ++ if (i < nbSequences) { ++ /* process last sequence */ ++ assert(i == nbSequences - 1); ++ dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); ++ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; ++ dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); ++ /* check (unlikely) long lengths > 65535 */ ++ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (UNLIKELY(inSeqs[i].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ } ++ ++ return longLen; ++} ++ ++/* the vector implementation could also be ported to SSSE3, ++ * but since this implementation is targeting modern systems (>= Sapphire Rapid), ++ * it's not useful to develop and maintain code for older pre-AVX2 platforms */ ++ ++#else /* no AVX2 */ ++ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ size_t longLen = 0; ++ size_t n; ++ for (n=0; n<nbSequences; n++) { ++ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); ++ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; ++ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); ++ /* check for long length > 65535 */ ++ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = n + 1; ++ } ++ if (UNLIKELY(inSeqs[n].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = n + nbSequences + 1; ++ } ++ } ++ return longLen; ++} ++ ++#endif ++ ++/* ++ * Precondition: Sequences must end on an explicit Block Delimiter ++ * @return: 0 on success, or an error code. ++ * Note: Sequence validation functionality has been disabled (removed). ++ * This is helpful to generate a lean main pipeline, improving performance. ++ * It may be re-inserted later. ++ */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int repcodeResolution) ++{ ++ Repcodes_t updatedRepcodes; ++ size_t seqNb = 0; ++ ++ DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); ++ ++ RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, ++ "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); ++ ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ++ /* check end condition */ ++ assert(nbSequences >= 1); ++ assert(inSeqs[nbSequences-1].matchLength == 0); ++ assert(inSeqs[nbSequences-1].offset == 0); ++ ++ /* Convert Sequences from public format to internal format */ ++ if (!repcodeResolution) { ++ size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); ++ cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; ++ if (longl) { ++ DEBUGLOG(5, "long length"); ++ assert(cctx->seqStore.longLengthType == ZSTD_llt_none); ++ if (longl <= nbSequences-1) { ++ DEBUGLOG(5, "long match length detected at pos %zu", longl-1); ++ cctx->seqStore.longLengthType = ZSTD_llt_matchLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-1); ++ } else { ++ DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); ++ assert(longl <= 2* (nbSequences-1)); ++ cctx->seqStore.longLengthType = ZSTD_llt_literalLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); ++ } ++ } ++ } else { ++ for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { ++ U32 const litLength = inSeqs[seqNb].litLength; ++ U32 const matchLength = inSeqs[seqNb].matchLength; ++ U32 const ll0 = (litLength == 0); ++ U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ if (!repcodeResolution && nbSequences > 1) { ++ U32* const rep = updatedRepcodes.rep; ++ ++ if (nbSequences >= 4) { ++ U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (nbSequences == 3) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[0].offset; ++ rep[0] = inSeqs[1].offset; ++ } else { ++ assert(nbSequences == 2); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[0].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); ++ ++ return 0; ++} ++ ++#if defined(ZSTD_ARCH_X86_AVX2) ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t i; ++ __m256i const zeroVec = _mm256_setzero_si256(); ++ __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ++ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ ++ size_t mSum = 0, lSum = 0; ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ++ /* Process 2 structs (32 bytes) at a time */ ++ for (i = 0; i + 2 <= nbSeqs; i += 2) { ++ /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ ++ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); ++ /* check end of block signal */ ++ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); ++ int cmp_res = _mm256_movemask_epi8(cmp); ++ /* indices for match lengths correspond to bits [8..11], [24..27] ++ * => combined mask = 0x0F000F00 */ ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ if (cmp_res & 0x0F000F00) break; ++ /* Accumulate in sumVec */ ++ sumVec = _mm256_add_epi32(sumVec, data); ++ } ++ ++ /* Horizontal reduction */ ++ 
_mm256_store_si256((__m256i*)tmp, sumVec); ++ lSum = tmp[1] + tmp[5]; ++ mSum = tmp[2] + tmp[6]; ++ ++ /* Handle the leftover */ ++ for (; i < nbSeqs; i++) { ++ lSum += seqs[i].litLength; ++ mSum += seqs[i].matchLength; ++ if (seqs[i].matchLength == 0) break; /* end of block */ ++ } ++ ++ if (i==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = i+1; ++ bs.blockSize = lSum + mSum; ++ bs.litSize = lSum; ++ return bs; ++ } ++} ++ ++#else ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t totalMatchSize = 0; ++ size_t litSize = 0; ++ size_t n; ++ assert(seqs); ++ for (n=0; n<nbSeqs; n++) { ++ totalMatchSize += seqs[n].matchLength; ++ litSize += seqs[n].litLength; ++ if (seqs[n].matchLength == 0) { ++ assert(seqs[n].offset == 0); ++ break; ++ } ++ } ++ if (n==nbSeqs) { ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = n+1; ++ bs.blockSize = litSize + totalMatchSize; ++ bs.litSize = litSize; ++ return bs; ++ } ++} ++#endif ++ ++ ++static size_t ++ZSTD_compressSequencesAndLiterals_internal(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t srcSize) ++{ ++ size_t remaining = srcSize; ++ size_t cSize = 0; ++ BYTE* op = (BYTE*)dst; ++ int const repcodeResolution = (cctx->appliedParams.searchForExternalRepcodes == ZSTD_ps_enable); ++ assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); ++ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); ++ RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); ++ ++ /* Special case: empty frame */ ++ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { ++ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); ++ MEM_writeLE24(op, cBlockHeader24); ++ op += ZSTD_blockHeaderSize; ++ dstCapacity -= ZSTD_blockHeaderSize; ++ cSize += ZSTD_blockHeaderSize; ++ } ++ ++ while (nbSequences) { ++ size_t compressedSeqsSize, cBlockSize, conversionStatus; ++ BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); ++ U32 const lastBlock = (block.nbSequences == nbSequences); ++ FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); ++ assert(block.nbSequences <= nbSequences); ++ RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ++ ZSTD_resetSeqStore(&cctx->seqStore); ++ ++ conversionStatus = ZSTD_convertBlockSequences(cctx, ++ inSeqs, block.nbSequences, ++ repcodeResolution); ++ FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); ++ inSeqs += block.nbSequences; ++ nbSequences -= block.nbSequences; ++ remaining -= block.blockSize; ++ ++ /* Note: when blockSize is very small, other variant send it uncompressed. ++ * Here, we still send the sequences, because we don't have the original source to send it uncompressed. ++ * One could imagine in theory reproducing the source from the sequences, ++ * but that's complex and costly memory intensive, and goes against the objectives of this variant. 
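++ *
++ * Editor's worked example (values chosen for illustration, not part of the
++ * patch): with seqs = { {of=8,ll=4,ml=6}, {of=2,ll=0,ml=5}, {of=0,ll=3,ml=0} },
++ * ZSTD_get1BlockSummary() above returns nbSequences=3 (delimiter included),
++ * litSize = 4+0+3 = 7 and blockSize = 7+6+5 = 18: exactly the span of source
++ * bytes this block accounts for, which drives the `remaining` bookkeeping here.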
*/ ++ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); ++ ++ compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( ++ op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ++ literals, block.litSize, ++ &cctx->seqStore, ++ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, ++ &cctx->appliedParams, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, ++ cctx->bmi2); ++ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); ++ /* note: the spec forbids for any compressed block to be larger than maximum block size */ ++ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); ++ litSize -= block.litSize; ++ literals = (const char*)literals + block.litSize; ++ ++ /* Note: difficult to check source for RLE block when only Literals are provided, ++ * but it could be considered from analyzing the sequence directly */ ++ ++ if (compressedSeqsSize == 0) { ++ /* Sending uncompressed blocks is out of reach, because the source is not provided. ++ * In theory, one could use the sequences to regenerate the source, like a decompressor, ++ * but it's complex, and memory hungry, killing the purpose of this variant. ++ * Current outcome: generate an error code. ++ */ ++ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); ++ } else { ++ U32 cBlockHeader; ++ assert(compressedSeqsSize > 1); /* no RLE */ ++ /* Error checking and repcodes update */ ++ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); ++ if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) ++ cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; ++ ++ /* Write block header into beginning of block*/ ++ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); ++ MEM_writeLE24(op, cBlockHeader); ++ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); ++ } ++ ++ cSize += cBlockSize; ++ op += cBlockSize; ++ dstCapacity -= cBlockSize; ++ cctx->isFirstBlock = 0; ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); ++ ++ if (lastBlock) { ++ assert(nbSequences == 0); ++ break; ++ } ++ } ++ ++ RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); ++ RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); ++ DEBUGLOG(4, "cSize final total: %zu", cSize); ++ return cSize; ++} ++ ++size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* literals, size_t litSize, size_t litCapacity, ++ size_t decompressedSize) ++{ ++ BYTE* op = (BYTE*)dst; ++ size_t cSize = 0; ++ ++ /* Transparent initialization stage, same as compressStream2() */ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); ++ assert(cctx != NULL); ++ if (litCapacity < litSize) { ++ RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); ++ } ++ 
FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); ++ ++ if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { ++ RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); ++ } ++ if (cctx->appliedParams.validateSequences) { ++ RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); ++ } ++ if (cctx->appliedParams.fParams.checksumFlag) { ++ RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); ++ } ++ ++ /* Begin writing output, starting with frame header */ ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, decompressedSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, ++ op, dstCapacity, ++ inSeqs, inSeqsSize, ++ literals, litSize, decompressedSize); ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ + DEBUGLOG(4, "Final compressed size: %zu", cSize); return cSize; } @@ -8765,7 +11326,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); } - +- size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - ZSTD_inBuffer input = { NULL, 0, 0 }; @@ -8776,7 +11337,27 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; -@@ -6092,7 +6879,7 @@ static ZSTD_compressionParameters ZSTD_g +@@ -6046,7 +7520,7 @@ static void ZSTD_dedicatedDictSearch_rev + } + } + +-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + switch (mode) { + case ZSTD_cpm_unknown: +@@ -6070,8 +7544,8 @@ static U64 ZSTD_getCParamRowSize(U64 src + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. +- * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. 
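++ *
++ * Editor's illustration (not part of the patch, assuming the default
++ * row-size computation): row selection is purely size-class based. E.g.
++ * srcSizeHint=100 KB, dictSize=0, mode=ZSTD_cpm_unknown gives rSize=100 KB,
++ * so tableID = (rSize<=256 KB)+(rSize<=128 KB)+(rSize<=16 KB) = 1+1+0 = 2,
++ * selecting the "<= 128 KB" parameter table.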
*/ ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); +@@ -6092,7 +7566,7 @@ static ZSTD_compressionParameters ZSTD_g cp.targetLength = (unsigned)(-clampedCompressionLevel); } /* refine parameters based on srcSize & dictSize */ @@ -8785,7 +11366,24 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -6127,3 +6914,29 @@ ZSTD_parameters ZSTD_getParams(int compr +@@ -6109,7 +7583,9 @@ ZSTD_compressionParameters ZSTD_getCPara + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ++static ZSTD_parameters ++ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) ++{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); +@@ -6123,7 +7599,34 @@ static ZSTD_parameters ZSTD_getParams_in + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ++ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) ++{ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); } @@ -8793,8 +11391,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +void ZSTD_registerSequenceProducer( + ZSTD_CCtx* zc, + void* extSeqProdState, -+ ZSTD_sequenceProducer_F extSeqProdFunc -+) { ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ + assert(zc != NULL); + ZSTD_CCtxParams_registerSequenceProducer( + &zc->requestedParams, extSeqProdState, extSeqProdFunc @@ -8804,8 +11402,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +void ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* extSeqProdState, -+ ZSTD_sequenceProducer_F extSeqProdFunc -+) { ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ + assert(params != NULL); + if (extSeqProdFunc != NULL) { + params->extSeqProdFunc = extSeqProdFunc; @@ -8825,14 +11423,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -20,6 +21,7 @@ +@@ -20,7 +21,8 @@ ***************************************/ #include "../common/zstd_internal.h" #include "zstd_cwksp.h" +- +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ - ++#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ /*-************************************* + * Constants @@ -32,7 +34,7 @@ It's not a big deal though : candidate will just be sorted again. Additionally, candidate position 1 will be lost. 
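/* Editor's aside (illustration only, not part of the patch): minimal usage
 * of the sequence-producer registration API added above. `myState` and
 * `myProducer` are hypothetical; a producer fills outSeqs and returns the
 * number of sequences written, or ZSTD_SEQUENCE_PRODUCER_ERROR on failure.
 *
 *   static size_t myProducer(void* state,
 *                            ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
 *                            const void* src, size_t srcSize,
 *                            const void* dict, size_t dictSize,
 *                            int compressionLevel, size_t windowSize)
 *   {
 *       (void)state; (void)dict; (void)dictSize;
 *       (void)compressionLevel; (void)windowSize;
 *       if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
 *       outSeqs[0].offset = 0;               /* final literals-only entry */
 *       outSeqs[0].litLength = (unsigned)srcSize;
 *       outSeqs[0].matchLength = 0;
 *       outSeqs[0].rep = 0;
 *       return 1;                            /* nb of sequences produced */
 *   }
 *
 *   ZSTD_registerSequenceProducer(cctx, &myState, myProducer);
 *
 * Passing NULL for both state and function clears the registration, per the
 * NULL handling in ZSTD_CCtxParams_registerSequenceProducer() above.
 */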
@@ -8842,7 +11442,100 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ -@@ -111,12 +113,13 @@ typedef struct { +@@ -76,6 +78,70 @@ typedef struct { + } ZSTD_entropyCTables_t; + + /* ********************************************* ++* Sequences * ++***********************************************/ ++typedef struct SeqDef_s { ++ U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ ++ U16 litLength; ++ U16 mlBase; /* mlBase == matchLength - MINMATCH */ ++} SeqDef; ++ ++/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ ++typedef enum { ++ ZSTD_llt_none = 0, /* no longLengthType */ ++ ZSTD_llt_literalLength = 1, /* represents a long literal */ ++ ZSTD_llt_matchLength = 2 /* represents a long match */ ++} ZSTD_longLengthType_e; ++ ++typedef struct { ++ SeqDef* sequencesStart; ++ SeqDef* sequences; /* ptr to end of sequences */ ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; ++ size_t maxNbSeq; ++ size_t maxNbLit; ++ ++ /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength ++ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment ++ * the existing value of the litLength or matchLength by 0x10000. ++ */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++} SeqStore_t; ++ ++typedef struct { ++ U32 litLength; ++ U32 matchLength; ++} ZSTD_SequenceLength; ++ ++/* ++ * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences ++ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. ++ */ ++MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) ++{ ++ ZSTD_SequenceLength seqLen; ++ seqLen.litLength = seq->litLength; ++ seqLen.matchLength = seq->mlBase + MINMATCH; ++ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { ++ if (seqStore->longLengthType == ZSTD_llt_literalLength) { ++ seqLen.litLength += 0x10000; ++ } ++ if (seqStore->longLengthType == ZSTD_llt_matchLength) { ++ seqLen.matchLength += 0x10000; ++ } ++ } ++ return seqLen; ++} ++ ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++ ++ ++/* ********************************************* + * Entropy buffer statistics structs and funcs * + ***********************************************/ + /* ZSTD_hufCTablesMetadata_t : +@@ -84,7 +150,7 @@ typedef struct { + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ + typedef struct { +- symbolEncodingType_e hType; ++ SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; + } ZSTD_hufCTablesMetadata_t; +@@ -95,9 +161,9 @@ typedef struct { + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ + typedef struct { +- symbolEncodingType_e llType; +- symbolEncodingType_e ofType; +- symbolEncodingType_e mlType; ++ SymbolEncodingType_e llType; ++ SymbolEncodingType_e ofType; ++ SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +@@ -111,12 +177,13 @@ typedef struct { /* ZSTD_buildBlockEntropyStats() : * Builds entropy for the block. * @return : 0 on success or error code */ @@ -8853,7 +11546,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize); +size_t ZSTD_buildBlockEntropyStats( -+ const seqStore_t* seqStorePtr, ++ const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, @@ -8862,17 +11555,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ******************************* * Compression internals structs * -@@ -142,26 +145,33 @@ typedef struct { +@@ -140,28 +207,29 @@ typedef struct { + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ size_t capacity; /* The capacity starting from `seq` pointer */ - } rawSeqStore_t; +-} rawSeqStore_t; ++} RawSeqStore_t; -+typedef struct { -+ U32 idx; /* Index in array of ZSTD_Sequence */ -+ U32 posInSequence; /* Position within sequence at idx */ -+ size_t posInSrc; /* Number of bytes given by sequences provided so far */ -+} ZSTD_sequencePosition; -+ - UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; +-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; ++UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; typedef struct { - int price; @@ -8903,7 +11594,30 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 litSum; /* nb of literals */ U32 litLengthSum; /* nb of litLength codes */ -@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { +@@ -173,7 +241,7 @@ typedef struct { + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + } optState_t; + + typedef struct { +@@ -195,11 +263,11 @@ typedef struct { + + #define ZSTD_WINDOW_START_INDEX 2 + +-typedef struct ZSTD_matchState_t ZSTD_matchState_t; ++typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +-struct ZSTD_matchState_t { ++struct ZSTD_MatchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. 
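 *
 * Editor's illustration (values chosen for illustration, not part of the
 * patch), referring to the SeqDef / long-length scheme introduced earlier in
 * this header: lengths are stored in U16 fields, so one oversized length per
 * block is flagged rather than widened. A 70000-byte match stores
 * mlBase = (U16)(70000 - MINMATCH) = 4461, with longLengthType ==
 * ZSTD_llt_matchLength and longLengthPos pointing at that SeqDef;
 * ZSTD_getSequenceLength() then reconstructs 4461 + MINMATCH + 0x10000 == 70000.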
+@@ -212,28 +280,42 @@ struct ZSTD_matchState_t { U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ @@ -8915,10 +11629,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32* hashTable; U32* hashTable3; -@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { - const ZSTD_matchState_t* dictMatchState; + U32* chainTable; + +- U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ ++ int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ +- const ZSTD_matchState_t* dictMatchState; ++ const ZSTD_MatchState_t* dictMatchState; ZSTD_compressionParameters cParams; - const rawSeqStore_t* ldmSeqStore; +- const rawSeqStore_t* ldmSeqStore; ++ const RawSeqStore_t* ldmSeqStore; + + /* Controls prefetching in some dictMatchState matchfinders. + * This behavior is controlled from the cctx ms. @@ -8934,13 +11658,69 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> }; typedef struct { -@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + } ZSTD_blockState_t; + + typedef struct { +@@ -260,7 +342,7 @@ typedef struct { + } ldmState_t; + + typedef struct { +- ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ ++ ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ +@@ -291,7 +373,7 @@ struct ZSTD_CCtx_params_s { + * There is no guarantee that hint is close to actual source size */ + + ZSTD_dictAttachPref_e attachDictPref; +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + + /* Multithreading: used to pass parameters to mtctx */ + int nbWorkers; +@@ -310,24 +392,54 @@ struct ZSTD_CCtx_params_s { + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ +- ZSTD_sequenceFormat_e blockDelimiters; ++ ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + +- /* Block splitting */ +- ZSTD_paramSwitch_e useBlockSplitter; ++ /* Block splitting ++ * @postBlockSplitter executes split analysis after sequences are produced, ++ * it's more accurate but consumes more resources. ++ * @preBlockSplitter_level splits before knowing sequences, ++ * it's more approximative but also cheaper. ++ * Valid @preBlockSplitter_level values range from 0 to 6 (included). ++ * 0 means auto, 1 means do not split, ++ * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). ++ * Highest @preBlockSplitter_level combines well with @postBlockSplitter. ++ */ ++ ZSTD_ParamSwitch_e postBlockSplitter; ++ int preBlockSplitter_level; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ +- ZSTD_paramSwitch_e useRowMatchFinder; ++ ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? 
*/ + int deterministicRefPrefix; /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; + + /* Controls prefetching in some dictMatchState matchfinders */ -+ ZSTD_paramSwitch_e prefetchCDictTables; ++ ZSTD_ParamSwitch_e prefetchCDictTables; + + /* Controls whether zstd will fall back to an internal matchfinder + * if the external matchfinder returns an error code. */ @@ -8952,15 +11732,61 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + void* extSeqProdState; + ZSTD_sequenceProducer_F extSeqProdFunc; + -+ /* Adjust the max block size*/ -+ size_t maxBlockSize; -+ + /* Controls repcode search in external sequence parsing */ -+ ZSTD_paramSwitch_e searchForExternalRepcodes; ++ ZSTD_ParamSwitch_e searchForExternalRepcodes; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) -@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + #define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) ++#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + + /* + * Indicates whether this compression proceeds directly from user-provided +@@ -345,11 +457,11 @@ typedef enum { + */ + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 + typedef struct { +- seqStore_t fullSeqStoreChunk; +- seqStore_t firstHalfSeqStore; +- seqStore_t secondHalfSeqStore; +- seqStore_t currSeqStore; +- seqStore_t nextSeqStore; ++ SeqStore_t fullSeqStoreChunk; ++ SeqStore_t firstHalfSeqStore; ++ SeqStore_t secondHalfSeqStore; ++ SeqStore_t currSeqStore; ++ SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +@@ -366,7 +478,7 @@ struct ZSTD_CCtx_s { + size_t dictContentSize; + + ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ +- size_t blockSize; ++ size_t blockSizeMax; + unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ + unsigned long long consumedSrcSize; + unsigned long long producedCSize; +@@ -378,13 +490,14 @@ struct ZSTD_CCtx_s { + int isFirstBlock; + int initialized; + +- seqStore_t seqStore; /* sequences storage ptrs */ ++ SeqStore_t seqStore; /* sequences storage ptrs */ + ldmState_t ldmState; /* long distance matching state */ + rawSeq* ldmSequences; /* Storage for the ldm output sequences */ + size_t maxNbLdmSequences; +- rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ++ RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + ZSTD_blockState_t blockState; +- U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ ++ void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ ++ size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; +@@ -404,6 +517,7 @@ struct ZSTD_CCtx_s { /* Stable in/out buffer verification */ ZSTD_inBuffer expectedInBuffer; @@ -8968,7 +11794,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t expectedOutBufferSize; /* Dictionary */ -@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { +@@ -417,9 +531,14 @@ struct ZSTD_CCtx_s { /* Workspace for block splitter */ ZSTD_blockSplitCtx blockSplitCtx; @@ -8983,7 +11809,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> typedef enum { ZSTD_noDict = 0, -@@ -441,7 +490,7 @@ typedef enum { +@@ -441,17 +560,17 @@ typedef enum { * In this mode we take both the source size and the dictionary size * into 
account when selecting and adjusting the parameters. */ @@ -8992,7 +11818,43 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * We don't know what these parameters are for. We default to the legacy * behavior of taking both the source size and the dict size into account * when selecting and adjusting parameters. -@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds( + */ +-} ZSTD_cParamMode_e; ++} ZSTD_CParamMode_e; + +-typedef size_t (*ZSTD_blockCompressor) ( +- ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++typedef size_t (*ZSTD_BlockCompressor_f) ( ++ ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); + + + MEM_STATIC U32 ZSTD_LLcode(U32 litLength) +@@ -497,12 +616,33 @@ MEM_STATIC int ZSTD_cParam_withinBounds( + return 1; + } + ++/* ZSTD_selectAddr: ++ * @return index >= lowLimit ? candidate : backup, ++ * tries to force branchless codegen. */ ++MEM_STATIC const BYTE* ++ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) ++{ ++#if defined(__x86_64__) ++ __asm__ ( ++ "cmp %1, %2\n" ++ "cmova %3, %0\n" ++ : "+r"(candidate) ++ : "r"(index), "r"(lowLimit), "r"(backup) ++ ); ++ return candidate; ++#else ++ return index >= lowLimit ? candidate : backup; ++#endif ++} ++ /* ZSTD_noCompressBlock() : * Writes uncompressed block to dst buffer from given src. * Returns the size of the block */ @@ -9005,7 +11867,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, dstSize_tooSmall, "dst buf too small for uncompressed block"); MEM_writeLE24(dst, cBlockHeader24); -@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock ( +@@ -510,7 +650,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock ( return ZSTD_blockHeaderSize + srcSize; } @@ -9015,7 +11877,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { BYTE* const op = (BYTE*)dst; U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); -@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t sr +@@ -529,7 +670,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t sr { U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); @@ -9024,7 +11886,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return (srcSize >> minlog) + 2; } -@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE con +@@ -565,29 +706,68 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE con while (ip < iend) *op++ = *ip++; } @@ -9050,25 +11912,67 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) +#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) +#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ ++ ++/*! ZSTD_storeSeqOnly() : ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. ++ * Literals themselves are not copied, but @litPtr is updated. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). 
++ * @matchLength : must be >= MINMATCH ++*/ ++HINT_INLINE UNUSED_ATTR void ++ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, ++ size_t litLength, ++ U32 offBase, ++ size_t matchLength) ++{ ++ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ++ ++ /* literal Length */ ++ assert(litLength <= ZSTD_BLOCKSIZE_MAX); ++ if (UNLIKELY(litLength>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_literalLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].litLength = (U16)litLength; ++ ++ /* match offset */ ++ seqStorePtr->sequences[0].offBase = offBase; ++ ++ /* match Length */ ++ assert(matchLength <= ZSTD_BLOCKSIZE_MAX); ++ assert(matchLength >= MINMATCH); ++ { size_t const mlBase = matchLength - MINMATCH; ++ if (UNLIKELY(mlBase>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_matchLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].mlBase = (U16)mlBase; ++ } ++ ++ seqStorePtr->sequences++; ++} /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. - * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). -+ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. + * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). * @matchLength : must be >= MINMATCH - * Allowed to overread literals up to litLimit. + * Allowed to over-read literals up to litLimit. */ HINT_INLINE UNUSED_ATTR void - ZSTD_storeSeq(seqStore_t* seqStorePtr, +-ZSTD_storeSeq(seqStore_t* seqStorePtr, ++ZSTD_storeSeq(SeqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, - U32 offBase_minus1, + U32 offBase, size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; -@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -596,8 +776,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); @@ -9079,7 +11983,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); -@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +@@ -607,9 +787,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, assert(literals + litLength <= litLimit); if (litEnd <= litLimit_w) { /* Common case we can use wildcopy. 
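[Editor's note] ZSTD_storeSeqOnly() above keeps litLength and mlBase in U16 fields and allows a single oversized value per block, recorded out of band through longLengthType/longLengthPos. A compilable model of just that bookkeeping (struct trimmed to the relevant fields; array capacity is arbitrary):

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

typedef unsigned short U16;
typedef unsigned U32;

typedef enum { ZSTD_llt_none, ZSTD_llt_literalLength, ZSTD_llt_matchLength } longLengthType_e;

typedef struct {
    U16 litLength[1024];
    U16 mlBase[1024];
    size_t nbSeq;
    longLengthType_e longLengthType; /* at most one long length per block */
    U32 longLengthPos;
} MiniSeqStore;

/* Mirrors ZSTD_storeSeqOnly(): oversized values keep only their low 16 bits
 * in the table; which sequence is oversized is recorded once, out of band. */
static void storeLengths(MiniSeqStore* s, size_t litLength, size_t mlBase)
{
    if (litLength > 0xFFFF) {
        assert(s->longLengthType == ZSTD_llt_none); /* only one allowed per block */
        s->longLengthType = ZSTD_llt_literalLength;
        s->longLengthPos = (U32)s->nbSeq;
    }
    s->litLength[s->nbSeq] = (U16)litLength;
    if (mlBase > 0xFFFF) {
        assert(s->longLengthType == ZSTD_llt_none);
        s->longLengthType = ZSTD_llt_matchLength;
        s->longLengthPos = (U32)s->nbSeq;
    }
    s->mlBase[s->nbSeq] = (U16)mlBase;
    s->nbSeq++;
}

int main(void)
{
    MiniSeqStore s = {{0}, {0}, 0, ZSTD_llt_none, 0};
    storeLengths(&s, 10, 5);
    storeLengths(&s, 70000, 12);  /* long literal run: truncated field + marker */
    printf("stored field: %u, oversized sequence at pos %u\n",
           (unsigned)s.litLength[1], (unsigned)s.longLengthPos);
    return 0;
}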
@@ -9092,16 +11996,35 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_copy16(seqStorePtr->lit, literals); if (litLength > 16) { ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); -@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, - seqStorePtr->sequences[0].litLength = (U16)litLength; +@@ -619,44 +799,22 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + } + seqStorePtr->lit += litLength; - /* match offset */ +- /* literal Length */ +- if (litLength>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_literalLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].litLength = (U16)litLength; +- +- /* match offset */ - seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); -+ seqStorePtr->sequences[0].offBase = offBase; - - /* match Length */ - assert(matchLength >= MINMATCH); -@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, +- +- /* match Length */ +- assert(matchLength >= MINMATCH); +- { size_t const mlBase = matchLength - MINMATCH; +- if (mlBase>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_matchLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].mlBase = (U16)mlBase; +- } +- +- seqStorePtr->sequences++; ++ ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); + } /* ZSTD_updateRep() : * updates in-place @rep (array of repeat offsets) @@ -9124,21 +12047,27 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (repCode > 0) { /* note : if repCode==0, no change */ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; -@@ -673,11 +723,11 @@ typedef struct repcodes_s { - } repcodes_t; +@@ -670,14 +828,14 @@ ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U3 - MEM_STATIC repcodes_t + typedef struct repcodes_s { + U32 rep[3]; +-} repcodes_t; ++} Repcodes_t; + +-MEM_STATIC repcodes_t -ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++MEM_STATIC Repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) { - repcodes_t newReps; +- repcodes_t newReps; ++ Repcodes_t newReps; ZSTD_memcpy(&newReps, rep, sizeof(newReps)); - ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); + ZSTD_updateRep(newReps.rep, offBase, ll0); return newReps; } -@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], +@@ -685,59 +843,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], /*-************************************* * Match length counter ***************************************/ @@ -9198,7 +12127,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; -@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, con +@@ -771,8 +876,8 @@ ZSTD_count_2segments(const BYTE* ip, con + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); +- DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); +- DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); ++ DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); ++ DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd)); + return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); +@@ -783,32 +888,43 @@ ZSTD_count_2segments(const BYTE* ip, con * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; @@ -9254,7 +12194,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> switch(mls) { default: -@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 h +@@ -820,6 +936,24 @@ size_t ZSTD_hashPtr(const void* p, U32 h } } @@ -9279,7 +12219,34 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ZSTD_ipow() : * Return base^exponent. */ -@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowC +@@ -881,11 +1015,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U + /*-************************************* + * Round buffer management + ***************************************/ +-#if (ZSTD_WINDOWLOG_MAX_64 > 31) +-# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +-#endif +-/* Max current allowed */ +-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) ++/* Max @current value allowed: ++ * In 32-bit mode: we want to avoid crossing the 2 GB limit, ++ * reducing risks of side effects in case of signed operations on indexes. ++ * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) ++ * doesn't overflow U32 index capacity (4 GB) */ ++#define ZSTD_CURRENT_MAX (MEM_64bits() ? 
3500U MB : 2000U MB) + /* Maximum chunk size before overflow correction needs to be called again */ + #define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ +@@ -925,7 +1060,7 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZS + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ++MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) + { + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : +@@ -1011,7 +1146,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowC * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. */ @@ -9290,7 +12257,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 maxDist, void const* src) { /* preemptive overflow correction: -@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window +@@ -1112,7 +1249,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0; +@@ -1157,7 +1294,7 @@ ZSTD_checkDictValidity(const ZSTD_window + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); +@@ -1167,10 +1304,15 @@ ZSTD_checkDictValidity(const ZSTD_window (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); assert(blockEndIdx >= loadedDictEnd); @@ -9307,23 +12292,71 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); *loadedDictEndPtr = 0; -@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_wi +@@ -1199,9 +1341,11 @@ MEM_STATIC void ZSTD_window_init(ZSTD_wi * forget about the extDict. Handles overlap of the prefix and extDict. * Returns non-zero if the segment is contiguous. */ -MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +- void const* src, size_t srcSize, +- int forceNonContiguous) +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize, - int forceNonContiguous) ++ const void* src, size_t srcSize, ++ int forceNonContiguous) { -@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U3 + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; +@@ -1228,8 +1372,9 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_w + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { +- ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; +- U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); ++ U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? 
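[Editor's note] The rewritten ZSTD_CURRENT_MAX above documents the actual constraint: stay under 2 GB of index space on 32-bit targets, and on 64-bit targets leave room to add a maximum 512 MB job without wrapping a U32. A toy loop showing why a running `ip - base` index must be rebased periodically; the real correction preserves the low cycleLog bits, while this sketch simply keeps one hypothetical 128 MB window:

#include <stdint.h>
#include <stdio.h>

#define MB (1u << 20)

/* Mirrors the macro above: 2000 MB on 32-bit, 3500 MB on 64-bit targets */
static uint32_t currentMax(int is64bit) { return (is64bit ? 3500u : 2000u) * MB; }

int main(void)
{
    uint32_t current = 0;          /* the U32 index "ip - base" */
    uint64_t totalRebased = 0;
    int i;
    for (i = 0; i < 100; i++) {    /* keep feeding 100 MB inputs */
        current += 100 * MB;
        if (current > currentMax(1)) {
            /* rebase the index instead of letting the U32 wrap */
            uint32_t const correction = current - 128 * MB;
            current -= correction;
            totalRebased += correction;
        }
    }
    printf("live index %u MB after rebasing %llu MB away\n",
           current / MB, (unsigned long long)(totalRebased / MB));
    return 0;
}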
window->dictLimit : (U32)highInputIdx; ++ assert(highInputIdx < UINT_MAX); + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } +@@ -1239,7 +1384,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_w + /* + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; +@@ -1256,7 +1401,7 @@ MEM_STATIC U32 ZSTD_getLowestMatchIndex( + /* + * Returns the lowest allowed match index in the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; +@@ -1269,6 +1414,13 @@ MEM_STATIC U32 ZSTD_getLowestPrefixIndex + return matchLowest; + } + ++/* index_safety_check: ++ * intentional underflow : ensure repIndex isn't overlapping dict + prefix ++ * @return 1 if values are not overlapping, ++ * 0 otherwise */ ++MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { ++ return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); ++} + + + /* debug functions */ +@@ -1302,7 +1454,42 @@ MEM_STATIC void ZSTD_debugTable(const U3 #endif +/* Short Cache */ -+ + +/* Normally, zstd matchfinders follow this flow: + * 1. Compute hash at ip + * 2. Load index from hashTable[hash] @@ -9359,9 +12392,53 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + return tag1 == tag2; +} - /* =============================================================== -@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* ds + * Shared internal declarations +@@ -1319,6 +1506,25 @@ size_t ZSTD_loadCEntropy(ZSTD_compressed + + void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_SequencePosition; ++ ++/* for benchmark */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int const repcodeResolution); ++ ++typedef struct { ++ size_t nbSequences; ++ size_t blockSize; ++ size_t litSize; ++} BlockSummary; ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); ++ + /* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress +@@ -1330,7 +1536,7 @@ void ZSTD_reset_compressedBlockState(ZST + * Note: srcSizeHint == 0 means 0! + */ + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + /*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. 
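[Editor's note] The Short Cache helpers above (ZSTD_writeTaggedIndex() / ZSTD_comparePackedTags()) trade 8 bits of index range for an 8-bit hash tag stored in the low bits of each table entry, letting prefetched dictMatchState lookups reject most candidates without dereferencing them. The pair in compilable form, with a tiny usage example:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef unsigned U32;

#define ZSTD_SHORT_CACHE_TAG_BITS 8
#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)

/* hashAndTag = (hash << 8) | tag, as produced by the salted hash functions */
static void writeTaggedIndex(U32* hashTable, size_t hashAndTag, U32 index)
{
    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
    assert((index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS)) == 0); /* index fits in 24 bits */
    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
}

static int comparePackedTags(size_t packedTag1, size_t packedTag2)
{
    U32 const tag1 = (U32)(packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK);
    U32 const tag2 = (U32)(packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK);
    return tag1 == tag2;
}

int main(void)
{
    U32 table[16] = {0};
    size_t const hashAndTag = (5u << ZSTD_SHORT_CACHE_TAG_BITS) | 0xAB; /* row 5, tag 0xAB */
    writeTaggedIndex(table, hashAndTag, 1234);
    printf("row 5 holds index %u, tags match: %d\n",
           table[5] >> ZSTD_SHORT_CACHE_TAG_BITS,
           comparePackedTags(table[5], hashAndTag));
    return 0;
}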
+@@ -1342,7 +1548,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CS + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr); ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr); + + /*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +@@ -1381,11 +1587,10 @@ size_t ZSTD_writeLastEmptyBlock(void* ds * This cannot be used when long range matching is enabled. * Zstd will use these sequences, and pass the literals to a secondary block * compressor. @@ -9374,37 +12451,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* ZSTD_cycleLog() : * condition for correct operation : hashLog > 1 */ -@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_stra +@@ -1396,4 +1601,28 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_stra */ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); -+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of -+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. -+ * Note that the block delimiter must include the last literals of the block. -+ */ -+size_t -+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, -+ ZSTD_sequencePosition* seqPos, -+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); -+ -+/* Returns the number of bytes to move the current read position back by. -+ * Only non-zero if we ended up splitting a sequence. -+ * Otherwise, it may return a ZSTD error if something went wrong. -+ * -+ * This function will attempt to scan through blockSize bytes -+ * represented by the sequences in @inSeqs, -+ * storing any (partial) sequences. -+ * -+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to -+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match -+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. -+ */ -+size_t -+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, -+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); -+ +/* Returns 1 if an external sequence producer is registered, otherwise returns 0. 
*/ +MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { + return params->extSeqProdFunc != NULL; @@ -9562,7 +12612,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); BYTE* const ostart = (BYTE*)dst; U32 singleStream = srcSize < 256; - symbolEncodingType_e hType = set_compressed; +- symbolEncodingType_e hType = set_compressed; ++ SymbolEncodingType_e hType = set_compressed; size_t cLitSize; - DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", @@ -9731,7 +12782,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> */ return nbSeq >= 2048; } -@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( +@@ -153,20 +154,20 @@ size_t ZSTD_crossEntropyCost(short const + return cost >> 8; + } + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) + { + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); if (mostFrequent == nbSeq) { *repeatMode = FSE_repeat_none; if (isDefaultAllowed && nbSeq <= 2) { @@ -9740,6 +12806,51 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. * If basic encoding isn't possible, always choose RLE. */ +@@ -241,7 +242,7 @@ typedef struct { + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -293,7 +294,7 @@ ZSTD_encodeSequences_body( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; +@@ -387,7 +388,7 @@ ZSTD_encodeSequences_default( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -405,7 +406,7 @@ ZSTD_encodeSequences_bmi2( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -421,7 +422,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, 
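[Editor's note] In the ZSTD_compressLiterals() hunks above, the literals section header is 3, 4, or 5 bytes depending on whether srcSize crosses 1 KB and 16 KB, and a single Huffman stream is chosen below 256 bytes. The sizing rule in isolation:

#include <stdio.h>
#include <stddef.h>

#define KB (1 << 10)

/* Mirrors the lhSize / singleStream selection visible above */
static size_t literalsHeaderSize(size_t srcSize)
{
    return 3 + (srcSize >= 1 * KB) + (srcSize >= 16 * KB);
}
static int useSingleStream(size_t srcSize) { return srcSize < 256; }

int main(void)
{
    size_t const samples[] = { 100, 300, 2048, 20000 };
    size_t i;
    for (i = 0; i < sizeof(samples)/sizeof(samples[0]); i++)
        printf("%5zu literals -> %zu-byte header, %s\n",
               samples[i], literalsHeaderSize(samples[i]),
               useSingleStream(samples[i]) ? "1 stream" : "4 streams");
    return 0;
}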
+- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + { + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); + #if DYNAMIC_BMI2 --- a/lib/zstd/compress/zstd_compress_sequences.h +++ b/lib/zstd/compress/zstd_compress_sequences.h @@ -1,5 +1,6 @@ @@ -9750,6 +12861,48 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -11,26 +12,27 @@ + #ifndef ZSTD_COMPRESS_SEQUENCES_H + #define ZSTD_COMPRESS_SEQUENCES_H + ++#include "zstd_compress_internal.h" /* SeqDef */ + #include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +-#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ ++#include "../common/zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ + + typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +-} ZSTD_defaultPolicy_e; ++} ZSTD_DefaultPolicy_e; + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -42,7 +44,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + + size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -1,5 +1,6 @@ @@ -9781,8 +12934,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { size_t const header = writeEntropy ? 200 : 0; size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); -@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_lite - symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; +@@ -50,11 +52,9 @@ static size_t ZSTD_compressSubBlock_lite + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; +- symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; ++ SymbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; size_t cLitSize = 0; - (void)bmi2; /* TODO bmi2... 
*/ @@ -9825,8 +12982,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - const seqDef* const send = sequences + nbSeq; - const seqDef* sp = sstart; +static size_t -+ZSTD_seqDecompressedSize(seqStore_t const* seqStore, -+ const seqDef* sequences, size_t nbSeqs, ++ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, ++ const SeqDef* sequences, size_t nbSeqs, + size_t litSize, int lastSubBlock) +{ size_t matchLengthSum = 0; @@ -9836,7 +12993,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); + size_t n; + for (n=0; n<nbSeqs; n++) { -+ const ZSTD_sequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n); ++ const ZSTD_SequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n); litLengthSum += seqLen.litLength; matchLengthSum += seqLen.matchLength; - sp++; @@ -9869,7 +13026,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +static size_t +ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, -+ const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, @@ -9912,6 +13069,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* ZSTD_compressSubBlock() : +@@ -258,7 +263,7 @@ static size_t ZSTD_compressSubBlock_sequ + * Or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, @@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(cons litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, @@ -9944,6 +13110,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -322,7 +328,7 @@ static size_t ZSTD_estimateSubBlockSize_ + return 0; + } + +-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, ++static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, @@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_ return cSeqSizeEstimate + sequencesSectionHeaderSize; } @@ -9982,11 +13157,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) -@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTable +@@ -415,14 +427,57 @@ static int ZSTD_needSequenceEntropyTable return 0; } -+static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) ++static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) +{ + size_t n, total = 0; + assert(sp != NULL); @@ -9999,7 +13174,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +#define BYTESCALE 256 + -+static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, ++static size_t sizeBlockSequences(const SeqDef* sp, size_t 
nbSeqs, + size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, + int firstSubBlock) +{ @@ -10036,20 +13211,26 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - * All sub-blocks are compressed blocks (no raw or rle blocks). - * @return : compressed size of the super block (which is multiple ZSTD blocks) - * Or 0 if it failed to compress. */ +-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + * Entropy will be written into the first block. + * The following blocks use repeat_mode to compress. + * Sub-blocks are all compressed, except the last one when beneficial. + * @return : compressed size of the super block (which features multiple ZSTD blocks) + * or 0 if it failed to compress. */ - static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ++static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, const ZSTD_compressedBlockState_t* prevCBlock, ZSTD_compressedBlockState_t* nextCBlock, -@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_mult + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +@@ -432,12 +487,14 @@ static size_t ZSTD_compressSubBlock_mult + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) { - const seqDef* const sstart = seqStorePtr->sequencesStart; - const seqDef* const send = seqStorePtr->sequences; +- const seqDef* const sstart = seqStorePtr->sequencesStart; +- const seqDef* const send = seqStorePtr->sequences; - const seqDef* sp = sstart; -+ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ const SeqDef* const sstart = seqStorePtr->sequencesStart; ++ const SeqDef* const send = seqStorePtr->sequences; ++ const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ + size_t const nbSeqs = (size_t)(send - sstart); const BYTE* const lstart = seqStorePtr->litStart; const BYTE* const lend = seqStorePtr->lit; @@ -10277,8 +13458,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* We have to regenerate the repcodes because we've skipped some sequences */ if (sp < send) { - seqDef const* seq; -+ const seqDef* seq; - repcodes_t rep; +- repcodes_t rep; ++ const SeqDef* seq; ++ Repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq = sstart; seq < sp; ++seq) { - ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); @@ -10305,6 +13487,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_entropyCTablesMetadata_t entropyMetadata; FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +@@ -559,7 +675,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, +@@ -569,5 +685,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); + } --- a/lib/zstd/compress/zstd_compress_superblock.h +++ b/lib/zstd/compress/zstd_compress_superblock.h @@ -1,5 +1,6 @@ @@ -10325,16 +13523,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
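[Editor's note] The repcode regeneration loop above replays ZSTD_updateRep() over the sequences already emitted so that a split super-block resumes with a correct rep history. A simplified standalone version of the update rule it replays, using the offBase convention defined in zstd_compress_internal.h:

#include <stdio.h>

typedef unsigned U32;
#define ZSTD_REP_NUM 3

/* Simplified ZSTD_updateRep(): offBase <= 3 names a repcode (shifted by one
 * when the literal length is 0); larger values carry offset + ZSTD_REP_NUM. */
static void updateRep(U32 rep[ZSTD_REP_NUM], U32 offBase, U32 ll0)
{
    if (offBase > ZSTD_REP_NUM) {             /* full offset */
        rep[2] = rep[1]; rep[1] = rep[0];
        rep[0] = offBase - ZSTD_REP_NUM;
    } else {                                  /* repcode */
        U32 const repCode = offBase - 1 + ll0;
        if (repCode > 0) {
            U32 const currentOffset = (repCode == ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
            rep[2] = (repCode >= 2) ? rep[1] : rep[2];
            rep[1] = rep[0];
            rep[0] = currentOffset;
        } /* repCode == 0: rep history unchanged */
    }
}

int main(void)
{
    U32 rep[ZSTD_REP_NUM] = { 1, 4, 8 };
    updateRep(rep, 3 + ZSTD_REP_NUM, 0);      /* store real offset 3 */
    updateRep(rep, 2 /* repcode 2 */, 0);     /* promote the second repcode */
    printf("rep = { %u, %u, %u }\n", rep[0], rep[1], rep[2]);
    return 0;
}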
* * This source code is licensed under both the BSD-style license (found in the -@@ -14,7 +15,9 @@ +@@ -14,8 +15,10 @@ /*-************************************* * Dependencies ***************************************/ +#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "../common/zstd_internal.h" +- +#include "../common/portability_macros.h" - ++#include "../common/compiler.h" /* ZS2_isPower2 */ /*-************************************* + * Constants @@ -41,8 +44,9 @@ ***************************************/ typedef enum { @@ -10408,7 +13608,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { (void)ws; -@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_intern +@@ -168,14 +184,16 @@ MEM_STATIC void ZSTD_cwksp_assert_intern assert(ws->tableEnd <= ws->allocStart); assert(ws->tableValidEnd <= ws->allocStart); assert(ws->allocStart <= ws->workspaceEnd); @@ -10417,7 +13617,45 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* -@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_all + * Align must be a power of 2. + */ +-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { ++MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { + size_t const mask = align - 1; +- assert((align & mask) == 0); ++ assert(ZSTD_isPower2(align)); + return (size + mask) & ~mask; + } + +@@ -189,7 +207,7 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_ + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * +- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). ++ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). + */ + MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) +@@ -197,12 +215,16 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size( + return size; + } + ++MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { ++ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); ++} ++ + /* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +- return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); ++MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { ++ return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); + } + + /* +@@ -210,14 +232,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_all * for internal purposes (currently only alignment). 
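[Editor's note] ZSTD_cwksp_align() above is the usual mask-based round-up, now guarded by ZSTD_isPower2() instead of the weaker `(align & mask) == 0` check. In isolation:

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

static int isPower2(size_t v) { return v != 0 && (v & (v - 1)) == 0; }

/* Round size up to the next multiple of align (align must be a power of 2) */
static size_t cwkspAlign(size_t size, size_t align)
{
    size_t const mask = align - 1;
    assert(isPower2(align));
    return (size + mask) & ~mask;
}

int main(void)
{
    printf("%zu %zu %zu\n", cwkspAlign(1, 64), cwkspAlign(64, 64), cwkspAlign(65, 64));
    /* prints: 64 64 128 */
    return 0;
}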
*/ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { @@ -10435,11 +13673,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return slackSpace; } -@@ -230,11 +244,19 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_al +@@ -229,12 +247,24 @@ MEM_STATIC size_t ZSTD_cwksp_slack_space + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { size_t const alignBytesMask = alignBytes - 1; size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; - assert((alignBytes & alignBytesMask) == 0); +- assert((alignBytes & alignBytesMask) == 0); - assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(ZSTD_isPower2(alignBytes)); + assert(bytes < alignBytes); return bytes; } @@ -10448,15 +13688,28 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * Returns the initial value for allocStart which is used to determine the position from + * which we can allocate from the end of the workspace. + */ -+MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { -+ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) ++{ ++ char* endPtr = (char*)ws->workspaceEnd; ++ assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); ++ endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); ++ return (void*)endPtr; +} + +/* * Internal function. Do not use directly. * Reserves the given number of bytes within the aligned/buffer segment of the wksp, * which counts from the end of the wksp (as opposed to the object/table segment). -@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c +@@ -246,7 +276,7 @@ ZSTD_cwksp_reserve_internal_buffer_space + { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; +- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", ++ DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); +@@ -274,27 +304,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c { assert(phase >= ws->phase); if (phase > ws->phase) { @@ -10490,7 +13743,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, "table phase - alignment initial allocation failed!"); -@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c +@@ -302,7 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; @@ -10501,7 +13754,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } -@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c +@@ -314,7 +335,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_c */ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { @@ -10510,7 +13763,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* -@@ -345,6 +358,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buff +@@ -345,29 +366,61 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buff /* * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). 
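[Editor's note] ZSTD_cwksp_bytes_to_align_ptr() above rounds a pointer up, while the new ZSTD_cwksp_initialAllocStart() rounds the workspace end down using a modulo on the address. Both directions as a standalone sketch, assuming the 64-byte ZSTD_CWKSP_ALIGNMENT_BYTES:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define ALIGN 64u

/* Bytes to add to ptr to reach the next ALIGN boundary (0 if already aligned) */
static size_t bytesToAlignPtr(void* ptr)
{
    size_t const mask = ALIGN - 1;
    size_t const bytes = (ALIGN - ((uintptr_t)ptr & mask)) & mask;
    assert(bytes < ALIGN);
    return bytes;
}

/* Round a workspace end pointer DOWN to an ALIGN boundary, as
 * ZSTD_cwksp_initialAllocStart() does with a modulo on the address */
static char* roundDown(char* endPtr)
{
    return endPtr - ((uintptr_t)endPtr % ALIGN);
}

int main(void)
{
    char buf[256];
    char* p = buf + 3;
    printf("pad up: %zu bytes; round down trims %td bytes\n",
           bytesToAlignPtr(p), (buf + 200) - roundDown(buf + 200));
    return 0;
}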
@@ -10520,12 +13773,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + * The main usage is for algorithms that might need read access into uninitialized memory. + * The algorithm must maintain safety under these conditions and must make sure it doesn't + * leak any of the past data (directly or in side channels). -+ */ + */ +-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) +MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) -+{ + { +- void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +- ZSTD_cwksp_alloc_aligned); +- assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); -+ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + if(ptr && ptr < ws->initOnceStart) { + /* We assume the memory following the current allocation is either: + * 1. Not usable as initOnce memory (end of workspace) @@ -10541,10 +13798,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +/* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). - */ - MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) - { -@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_alig ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) ++{ ++ void* const ptr = ZSTD_cwksp_reserve_internal(ws, ++ ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), ++ ZSTD_cwksp_alloc_aligned); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; + } /* * Aligned on 64 bytes. These buffers have the special property that @@ -10571,7 +13833,37 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } alloc = ws->tableEnd; end = (BYTE *)alloc + bytes; -@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables( +@@ -387,7 +440,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_tabl + + + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +- assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return alloc; + } + +@@ -421,6 +474,20 @@ MEM_STATIC void* ZSTD_cwksp_reserve_obje + + return alloc; + } ++/* ++ * with alignment control ++ * Note : should happen only once, at workspace first initialization ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) ++{ ++ size_t const mask = alignment - 1; ++ size_t const surplus = (alignment > sizeof(void*)) ? 
alignment - sizeof(void*) : 0; ++ void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); ++ if (start == NULL) return NULL; ++ if (surplus == 0) return start; ++ assert(ZSTD_isPower2(alignment)); ++ return (void*)(((size_t)start + surplus) & ~mask); ++} + + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) + { +@@ -451,7 +518,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables( assert(ws->tableValidEnd >= ws->objectEnd); assert(ws->tableValidEnd <= ws->allocStart); if (ws->tableValidEnd < ws->tableEnd) { @@ -10580,7 +13872,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } ZSTD_cwksp_mark_tables_clean(ws); } -@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cw +@@ -460,7 +527,8 @@ MEM_STATIC void ZSTD_cwksp_clean_tables( + * Invalidates table allocations. + * All other allocations remain valid. + */ +-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { ++MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) ++{ + DEBUGLOG(4, "cwksp: clearing tables!"); + + +@@ -478,14 +546,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cw ws->tableEnd = ws->objectEnd; @@ -10607,7 +13909,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* * The provided workspace takes ownership of the buffer [start, start+size). * Any existing values in the workspace are ignored (the previously managed -@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwk +@@ -498,6 +575,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwk ws->workspaceEnd = (BYTE*)start + size; ws->objectEnd = ws->workspace; ws->tableValidEnd = ws->objectEnd; @@ -10615,7 +13917,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ws->phase = ZSTD_cwksp_alloc_objects; ws->isStatic = isStatic; ZSTD_cwksp_clear(ws); -@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwk +@@ -529,15 +607,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwk ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); } @@ -10631,7 +13933,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { return ws->allocFailed; } -@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed +@@ -550,17 +619,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed * Returns if the estimated space needed for a wksp is within an acceptable limit of the * actual amount of space used. 
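[Editor's note] ZSTD_cwksp_reserve_object_aligned() above obtains stricter-than-pointer alignment from an allocator that only guarantees pointer alignment: it over-allocates by `alignment - sizeof(void*)` surplus bytes and rounds the returned address up, which provably stays inside the allocation. The same pattern with malloc standing in for the workspace carve-out:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

static int isPower2(size_t v) { return v != 0 && (v & (v - 1)) == 0; }

/* Over-allocate, then round the start up to the requested alignment.
 * (The real code carves from the cwksp object segment; malloc is a stand-in.) */
static void* allocAligned(size_t byteSize, size_t alignment, void** rawOut)
{
    size_t const mask = alignment - 1;
    size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0;
    void* const start = malloc(byteSize + surplus);
    assert(isPower2(alignment));
    *rawOut = start;                       /* keep the raw pointer for free() */
    if (start == NULL) return NULL;
    if (surplus == 0) return start;
    return (void*)(((uintptr_t)start + surplus) & ~(uintptr_t)mask);
}

int main(void)
{
    void* raw;
    void* p = allocAligned(100, 64, &raw);
    printf("aligned to 64: %d\n", p != NULL && ((uintptr_t)p % 64) == 0);
    free(raw);
    return 0;
}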
*/ @@ -10654,6 +13956,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } +@@ -591,5 +654,4 @@ MEM_STATIC void ZSTD_cwksp_bump_oversize + } + } + +- + #endif /* ZSTD_CWKSP_H */ --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -1,5 +1,6 @@ @@ -10673,7 +13981,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -10711,11 +14019,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm) { const ZSTD_compressionParameters* const cParams = &ms->cParams; -@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_match +@@ -43,13 +85,26 @@ void ZSTD_fillDoubleHashTable(ZSTD_match /* Only load extra positions for ZSTD_dtlm_full */ if (dtlm == ZSTD_dtlm_fast) break; @@ -10723,7 +14031,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } } +} + -+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) @@ -10739,8 +14047,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_doubleFast_noDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls /* template */) + { + ZSTD_compressionParameters const* cParams = &ms->cParams; @@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noD const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; @@ -10750,7 +14061,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t mLength; U32 offset; -@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noD +@@ -88,9 +143,14 @@ size_t ZSTD_compressBlock_doubleFast_noD + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ ++ const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ ++ /* Array of ~random data, should have low probability of matching data ++ * we load from here instead of from tables, if matchl0/matchl1 are ++ * invalid indices. Used to avoid unpredictable branches. 
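[Editor's note] The dummy[] array above works together with ZSTD_selectAddr() from zstd_compress_internal.h: when the candidate index is out of window, a conditional move redirects the speculative load to the dummy bytes, so the hot loop performs one predictable comparison instead of an unpredictable branch, and a later `safe == candidate` check filters the rare false hit. A portable sketch of the idea (upstream additionally forces cmova via inline asm on x86-64):

#include <stdio.h>
#include <string.h>

typedef unsigned U32;
typedef unsigned char BYTE;

/* Portable fallback form of ZSTD_selectAddr(): compilers typically lower
 * this ternary to a conditional move, keeping the load unconditional. */
static const BYTE* selectAddr(U32 index, U32 lowLimit,
                              const BYTE* candidate, const BYTE* backup)
{
    return index >= lowLimit ? candidate : backup;
}

int main(void)
{
    /* ~random bytes: matching against them is overwhelmingly unlikely */
    static const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4};
    const BYTE window[] = "abcdefgh";
    const BYTE ip[] = "abcdefgh";

    /* index 0 is below lowLimit 4: the compare safely reads dummy instead */
    const BYTE* safe = selectAddr(0, 4, window, dummy);
    int const tentative = memcmp(safe, ip, 8) == 0;
    printf("match? %d (address confirms candidate: %d)\n", tentative, safe == window);
    return 0;
}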
*/ ++ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +@@ -100,8 +160,8 @@ size_t ZSTD_compressBlock_doubleFast_noD U32 const current = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); U32 const maxRep = current - windowLow; @@ -10761,7 +14087,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* Outer Loop: one iteration per match found and stored */ -@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noD +@@ -131,30 +191,35 @@ size_t ZSTD_compressBlock_doubleFast_noD if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; ip++; @@ -10770,7 +14096,44 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> goto _match_stored; } -@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noD + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +- if (idxl0 > prefixLowestIndex) { ++ /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. ++ * However expression below complies into conditional move. Since ++ * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex ++ * if there is a match, all branches become predictable. */ ++ { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); ++ + /* check prefix long match */ +- if (MEM_read64(matchl0) == MEM_read64(ip)) { ++ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; +- } +- } ++ } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + +- if (idxs0 > prefixLowestIndex) { +- /* check prefix short match */ +- if (MEM_read32(matchs0) == MEM_read32(ip)) { +- goto _search_next_long; +- } ++ /* Same optimization as matchl0 above */ ++ matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); ++ ++ /* check prefix short match */ ++ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { ++ goto _search_next_long; + } + + if (ip1 >= nextStep) { +@@ -175,30 +240,36 @@ size_t ZSTD_compressBlock_doubleFast_noD } while (ip1 <= ilimit); _cleanup: @@ -10786,7 +14149,40 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -217,7 +276,7 @@ _match_found: /* requires ip, offset, mL + + _search_next_long: + +- /* check prefix long +1 match */ +- if (idxl1 > prefixLowestIndex) { +- if (MEM_read64(matchl1) == MEM_read64(ip1)) { ++ /* short match found: let's check for a longer one */ ++ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; ++ offset = (U32)(ip - matchs0); ++ ++ /* check long match at +1 position */ ++ if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { ++ size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; ++ if (l1len > mLength) { ++ /* use the long match instead */ + ip = ip1; +- mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; ++ mLength = l1len; + offset = (U32)(ip-matchl1); +- while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +- goto _match_found; ++ matchs0 = matchl1; + } + } + +- /* if no long +1 match, explore the short match we found */ +- mLength = ZSTD_count(ip+4, 
matchs0+4, iend) + 4; +- offset = (U32)(ip - matchs0); +- while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ ++ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +@@ -217,7 +288,7 @@ _match_found: /* requires ip, offset, mL hashLong[hl1] = (U32)(ip1 - base); } @@ -10795,7 +14191,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> _match_stored: /* match found */ -@@ -243,7 +302,7 @@ _match_stored: +@@ -243,7 +314,7 @@ _match_stored: U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); @@ -10804,23 +14200,29 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ip += rLength; anchor = ip; continue; /* faster when present ... (?) */ -@@ -254,6 +313,7 @@ _match_stored: +@@ -254,8 +325,9 @@ _match_stored: FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, -@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dic + U32 const mls /* template */) + { +@@ -275,9 +347,8 @@ size_t ZSTD_compressBlock_doubleFast_dic const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; - const ZSTD_matchState_t* const dms = ms->dictMatchState; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dictCParams = &dms->cParams; -@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dic + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; +@@ -286,8 +357,8 @@ size_t ZSTD_compressBlock_doubleFast_dic const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); @@ -10831,7 +14233,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); -@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -295,6 +366,13 @@ size_t ZSTD_compressBlock_doubleFast_dic /* if a dictionary is attached, it must be within window range */ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); @@ -10845,7 +14247,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* init */ ip += (dictAndPrefixLength == 0); -@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -309,8 +387,12 @@ size_t ZSTD_compressBlock_doubleFast_dic U32 offset; size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); @@ -10860,7 +14262,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 const curr = (U32)(ip-base); U32 const matchIndexL = hashLong[h2]; U32 matchIndexS = hashSmall[h]; -@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -323,26 +405,24 @@ size_t 
ZSTD_compressBlock_doubleFast_dic + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; ip++; @@ -10869,11 +14277,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> goto _match_stored; } -@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dic - while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - goto _match_found; - } +- if (matchIndexL > prefixLowestIndex) { ++ if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ +- if (MEM_read64(matchLong) == MEM_read64(ip)) { +- mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; +- offset = (U32)(ip-matchLong); +- while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ +- goto _match_found; +- } - } else { ++ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; ++ offset = (U32)(ip-matchLong); ++ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ ++ goto _match_found; + } else if (dictTagsMatchL) { /* check dictMatchState long match */ - U32 const dictMatchIndexL = dictHashLong[dictHL]; @@ -10881,7 +14298,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* dictMatchL = dictBase + dictMatchIndexL; assert(dictMatchL < dictEnd); -@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -354,13 +434,13 @@ size_t ZSTD_compressBlock_doubleFast_dic + } } + + if (matchIndexS > prefixLowestIndex) { +- /* check prefix short match */ ++ /* short match candidate */ if (MEM_read32(match) == MEM_read32(ip)) { goto _search_next_long; } @@ -10893,7 +14315,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> match = dictBase + dictMatchIndexS; matchIndexS = dictMatchIndexS + dictIndexDelta; -@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -375,25 +455,24 @@ size_t ZSTD_compressBlock_doubleFast_dic continue; _search_next_long: @@ -10907,11 +14329,22 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* matchL3 = base + matchIndexL3; hashLong[hl3] = curr + 1; -@@ -391,9 +462,9 @@ _search_next_long: - while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ - goto _match_found; - } + /* check prefix long +1 match */ +- if (matchIndexL3 > prefixLowestIndex) { +- if (MEM_read64(matchL3) == MEM_read64(ip+1)) { +- mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; +- ip++; +- offset = (U32)(ip-matchL3); +- while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ +- goto _match_found; +- } - } else { ++ if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { ++ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ++ ip++; ++ offset = (U32)(ip-matchL3); ++ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ ++ goto _match_found; + } else if (dictTagsMatchL3) { /* check dict long 
+1 match */ - U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; @@ -10919,7 +14352,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; assert(dictMatchL3 < dictEnd); if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { -@@ -419,7 +490,7 @@ _match_found: +@@ -419,7 +498,7 @@ _match_found: offset_2 = offset_1; offset_1 = offset; @@ -10928,7 +14361,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> _match_stored: /* match found */ -@@ -448,7 +519,7 @@ _match_stored: +@@ -443,12 +522,12 @@ _match_stored: + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ @@ -10937,7 +14376,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; -@@ -461,8 +532,8 @@ _match_stored: +@@ -461,8 +540,8 @@ _match_stored: } /* while (ip < ilimit) */ /* save reps for next block */ @@ -10948,18 +14387,54 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dic +@@ -470,7 +549,7 @@ _match_stored: + + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +@@ -488,7 +567,7 @@ ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -508,7 +587,7 @@ size_t ZSTD_compressBlock_doubleFast( + + + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -527,8 +606,10 @@ size_t ZSTD_compressBlock_doubleFast_dic } -static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls /* template */) -@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleF + { +@@ -579,13 +660,13 @@ static size_t ZSTD_compressBlock_doubleF + size_t mLength; + hashSmall[hSmall] = 
hashLong[hLong] = curr; /* update hash table */ + +- if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ ++ if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; @@ -10968,7 +14443,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; -@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleF +@@ -596,7 +677,7 @@ static size_t ZSTD_compressBlock_doubleF while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; @@ -10977,7 +14452,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); -@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleF +@@ -621,7 +702,7 @@ static size_t ZSTD_compressBlock_doubleF } offset_2 = offset_1; offset_1 = offset; @@ -10986,7 +14461,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } else { ip += ((ip-anchor) >> kSearchStrength) + 1; -@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleF +@@ -647,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleF + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ @@ -10995,7 +14477,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; -@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_ext +@@ -677,7 +758,7 @@ ZSTD_GEN_DFAST_FN(extDict, 6) + ZSTD_GEN_DFAST_FN(extDict, 7) + + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -694,3 +775,5 @@ size_t ZSTD_compressBlock_doubleFast_ext return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); } } @@ -11011,24 +14502,36 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -15,8 +16,12 @@ +@@ -11,22 +12,32 @@ + #ifndef ZSTD_DOUBLE_FAST_H + #define ZSTD_DOUBLE_FAST_H + +- #include "../common/mem.h" /* U32 */ #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + - void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, -- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_ext - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +- +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict @@ -11038,7 +14541,6 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ - #endif /* ZSTD_DOUBLE_FAST_H */ --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -11056,7 +14558,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ @@ -11068,8 +14570,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; - --void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ + /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. + * Feel free to remove this assert if there's a good reason! 
*/ + assert(dtlm == ZSTD_dtlm_full); @@ -11089,12 +14590,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); -+ } } } } ++ } } } } +} -+ + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, ++void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, const void* const end, ZSTD_dictTableLoadMethod_e dtlm) { @@ -11109,11 +14611,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Always insert every fastHashFillStep position into the hash table. * Insert the other positions if their hash entry is empty. */ -@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_ +@@ -42,6 +85,60 @@ void ZSTD_fillHashTable(ZSTD_matchState_ } } } } } -+void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) @@ -11124,23 +14626,77 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_fillHashTableForCCtx(ms, end, dtlm); + } +} ++ ++ ++typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); ++ ++static int ++ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* Array of ~random data, should have low probability of matching data. ++ * Load from here if the index is invalid. ++ * Used to avoid unpredictable branches. */ ++ static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; ++ ++ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. ++ * However expression below compiles into conditional move. ++ */ ++ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); ++ /* Note: this used to be written as : return test1 && test2; ++ * Unfortunately, once inlined, these tests become branches, ++ * in which case it becomes critical that they are executed in the right order (test1 then test2). ++ * So we have to write these tests in a specific manner to ensure their ordering. ++ */ ++ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; ++ /* force ordering of these tests, which matters once the function is inlined, as they become branches */ ++ __asm__(""); ++ return matchIdx >= idxLowLimit; ++} ++ ++static int ++ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* using a branch instead of a cmov, ++ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, ++ * aka almost all candidates are within range */ ++ U32 mval; ++ if (matchIdx >= idxLowLimit) { ++ mval = MEM_read32(matchAddress); ++ } else { ++ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ ++ } ++ ++ return (MEM_read32(currentPtr) == mval); ++} + /* * If you squint hard enough (and ignore repcodes), the search operation at any -@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_ +@@ -89,17 +186,17 @@ void ZSTD_fillHashTable(ZSTD_matchState_ * * This is also the work we do at the beginning to enter the loop initially. 
*/ -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_noDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls, U32 const hasStep) -@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( +- U32 const mls, U32 const hasStep) ++ U32 const mls, int useCmov) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; +- /* support stepSize of 0 */ +- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +@@ -117,12 +214,11 @@ ZSTD_compressBlock_fast_noDict_generic( U32 rep_offset1 = rep[0]; U32 rep_offset2 = rep[1]; @@ -11149,7 +14705,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t hash0; /* hash for ip0 */ size_t hash1; /* hash for ip1 */ -@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( +- U32 idx; /* match idx for ip0 */ +- U32 mval; /* src value at match idx */ ++ U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; +@@ -135,14 +231,15 @@ ZSTD_compressBlock_fast_noDict_generic( + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); ++ const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); { U32 const curr = (U32)(ip0 - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); U32 const maxRep = curr - windowLow; @@ -11160,7 +14729,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* start each op */ -@@ -180,8 +236,14 @@ _start: /* Requires: ip0 */ +@@ -163,7 +260,7 @@ _start: /* Requires: ip0 */ + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +- idx = hashTable[hash0]; ++ matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ +@@ -180,26 +277,28 @@ _start: /* Requires: ip0 */ mLength = ip0[-1] == match0[-1]; ip0 -= mLength; match0 -= mLength; @@ -11168,50 +14746,75 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + offcode = REPCODE1_TO_OFFBASE; mLength += 4; + -+ /* First write next hash table entry; we've already calculated it. -+ * This write is known to be safe because the ip1 is before the ++ /* Write next hash table entry: it's already calculated. ++ * This write is known to be safe because ip1 is before the + * repcode (ip2). */ + hashTable[hash1] = (U32)(ip1 - base); + goto _match; } -@@ -195,6 +257,12 @@ _start: /* Requires: ip0 */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { - /* found a match! */ -+ -+ /* First write next hash table entry; we've already calculated it. 
-+ * This write is known to be safe because the ip1 == ip0 + 1, so -+ * we know we will resume searching after ip1 */ +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry (it's already calculated). ++ * This write is known to be safe because the ip1 == ip0 + 1, ++ * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); -+ + +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ goto _offset; } -@@ -224,6 +292,21 @@ _start: /* Requires: ip0 */ - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { - /* found a match! */ -+ -+ /* first write next hash table entry; we've already calculated it */ + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -214,21 +313,19 @@ _start: /* Requires: ip0 */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } +- +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry, since it's already calculated */ + if (step <= 4) { -+ /* We need to avoid writing an index into the hash table >= the -+ * position at which we will pick up our searching after we've -+ * taken this match. -+ * -+ * The minimum possible match has length 4, so the earliest ip0 -+ * can be after we take this match will be the current ip0 + 4. -+ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely -+ * write this position. -+ */ ++ /* Avoid writing an index if it's >= position where search will resume. ++ * The minimum possible match has length 4, so search can resume at ip0 + 4. ++ */ + hashTable[hash1] = (U32)(ip1 - base); + } -+ goto _offset; } -@@ -254,9 +337,24 @@ _cleanup: + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -250,13 +347,28 @@ _start: /* Requires: ip0 */ + } while (ip3 < ilimit); + + _cleanup: +- /* Note that there are probably still a couple positions we could search. ++ /* Note that there are probably still a couple positions one could search. * However, it seems to be a meaningful performance hit to try to search * them. So let's not. */ @@ -11238,8 +14841,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -267,7 +365,7 @@ _offset: /* Requires: ip0, idx */ - match0 = base + idx; +@@ -264,10 +376,10 @@ _cleanup: + _offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ +- match0 = base + idx; ++ match0 = base + matchIdx; rep_offset2 = rep_offset1; rep_offset1 = (U32)(ip0-match0); - offcode = STORE_OFFSET(rep_offset1); @@ -11247,7 +14854,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> mLength = 4; /* Count the backwards match length. 
*/ -@@ -287,11 +385,6 @@ _match: /* Requires: ip0, match0, offcod +@@ -287,11 +399,6 @@ _match: /* Requires: ip0, match0, offcod ip0 += mLength; anchor = ip0; @@ -11259,7 +14866,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Fill table and check for immediate repcode. */ if (ip0 <= ilimit) { /* Fill Table */ -@@ -306,7 +399,7 @@ _match: /* Requires: ip0, match0, offcod +@@ -306,7 +413,7 @@ _match: /* Requires: ip0, match0, offcod { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ip0 += rLength; @@ -11268,15 +14875,70 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> anchor = ip0; continue; /* faster when present (confirmed on gcc-8) ... (?) */ } } } -@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( +@@ -314,12 +421,12 @@ _match: /* Requires: ip0, match0, offcod + goto _start; + } + +-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ +- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ ++ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ +- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ ++ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + + ZSTD_GEN_FAST_FN(noDict, 4, 1) +@@ -333,13 +440,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0) + ZSTD_GEN_FAST_FN(noDict, 7, 0) + + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- U32 const mls = ms->cParams.minMatch; ++ U32 const mml = ms->cParams.minMatch; ++ /* use cmov when "candidate in range" branch is likely unpredictable */ ++ int const useCmov = ms->cParams.windowLog < 19; + assert(ms->dictMatchState == NULL); +- if (ms->cParams.targetLength > 1) { +- switch(mls) ++ if (useCmov) { ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -352,7 +461,8 @@ size_t ZSTD_compressBlock_fast( + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { +- switch(mls) ++ /* use a branch instead */ ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -364,13 +474,13 @@ size_t ZSTD_compressBlock_fast( + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } +- + } } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_dictMatchState_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls, U32 const hasStep) -@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatch + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -380,16 +490,16 @@ size_t ZSTD_compressBlock_fast_dictMatch U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; @@ -11291,9 +14953,12 @@ Signed-off-by: Oleksandr Natalenko 
<oleksandr@natalenko.name> U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; - const ZSTD_matchState_t* const dms = ms->dictMatchState; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; -@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatch + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; +@@ -397,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatch const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); @@ -11310,7 +14975,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ -@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatch +@@ -413,106 +523,154 @@ size_t ZSTD_compressBlock_fast_dictMatch * when translating a dict index into a local index */ assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); @@ -11389,8 +15054,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); + hashTable[hash0] = curr; /* update hash table */ + -+ if (((U32) ((prefixStartIndex - 1) - repIndex) >= -+ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; @@ -11423,8 +15087,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + } + } + -+ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { -+ /* found a regular match */ ++ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { ++ /* found a regular match of size >= 4 */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) @@ -11496,8 +15160,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2; - if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; @@ -11531,18 +15196,28 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Return the last literals size */ return (size_t)(iend - anchor); -@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatch +@@ -525,7 +683,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -545,19 +703,20 @@ size_t ZSTD_compressBlock_fast_dictMatch } -static size_t ZSTD_compressBlock_fast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { -@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_ex + const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; /* support stepSize of 0 */ @@ -11555,7 +15230,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); -@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_ex +@@ -570,6 +729,28 @@ static size_t ZSTD_compressBlock_fast_ex const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; U32 offset_1=rep[0], offset_2=rep[1]; @@ -11584,7 +15259,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> (void)hasStep; /* not currently specialized on whether it's accelerated */ -@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_ex +@@ -579,75 +760,202 @@ static size_t ZSTD_compressBlock_fast_ex if (prefixStartIndex == dictStartIndex) return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); @@ -11830,7 +15505,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + while (ip0 <= ilimit) { + U32 const repIndex2 = (U32)(ip0-base) - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; -+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; @@ -11848,7 +15523,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } ZSTD_GEN_FAST_FN(extDict, 4, 0) -@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( +@@ -656,10 +964,11 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0) + ZSTD_GEN_FAST_FN(extDict, 7, 0) + + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; @@ -11866,16 +15546,34 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -16,7 +17,8 @@ +@@ -11,21 +12,20 @@ + #ifndef ZSTD_FAST_H + #define ZSTD_FAST_H + +- + #include "../common/mem.h" /* U32 */ #include "zstd_compress_internal.h" - void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); size_t ZSTD_compressBlock_fast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- + #endif /* ZSTD_FAST_H */ --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -11908,7 +15606,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -ZSTD_updateDUBT(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend, U32 mls) { @@ -11920,22 +15618,33 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) -@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t +@@ -149,9 +160,10 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t } -static size_t -ZSTD_DUBT_findBetterDictMatch ( +- const ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBetterDictMatch ( - const ZSTD_matchState_t* ms, ++ const ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, + size_t bestLength, +@@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +- const ZSTD_matchState_t * const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + 
const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; @@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { @@ -11964,7 +15673,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, + size_t* offBasePtr, @@ -12002,7 +15711,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } return bestLength; } -@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ +@@ -378,24 +391,25 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_ /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ @@ -12010,7 +15719,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, + size_t* offBasePtr, @@ -12025,6 +15734,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* ********************************* + * Dedicated dict search + ***********************************/ + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); +@@ -514,7 +528,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadD + */ + FORCE_INLINE_TEMPLATE + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +- const ZSTD_matchState_t* const dms, ++ const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { @@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_sea /* save best solution */ if (currentMl > ml) { @@ -12048,17 +15774,18 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Update chains up to ip (excluded) Assumption : always within prefix (i.e. 
not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +- ZSTD_matchState_t* ms, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) + const BYTE* ip, U32 const mls, U32 const lazySkipping) { U32* const hashTable = ms->hashTable; const U32 hashLog = cParams->hashLog; -@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd +@@ -632,21 +648,25 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; hashTable[h] = idx; idx++; @@ -12068,9 +15795,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } ms->nextToUpdate = target; -@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAnd + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; + } - U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); @@ -12080,8 +15809,20 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_HcFindBestMatch( - ZSTD_matchState_t* ms, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +@@ -670,7 +690,7 @@ size_t ZSTD_HcFindBestMatch( + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? 
dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
 const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch(
 }
 
 /* HC4 match finder */
- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
 for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
 size_t currentMl=0;
 if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
 const BYTE* const match = base + matchIndex;
 assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
- if (match[ml] == ip[ml]) /* potentially better */
+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
 currentMl = ZSTD_count(ip, match, iLimit);
 } else {
 const BYTE* const match = dictBase + matchIndex;
@@ -12215,7 +15956,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 
-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base,
 U32 const rowLog, U32 const mls,
 U32 idx, const BYTE* const iLimit)
 {
@@ -12262,8 +16003,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
- U32 updateStartIdx, U32 const updateEndIdx,
- U32 const mls, U32 const rowLog,
- U32 const rowMask, U32 const useCache)
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms,
+ U32 updateStartIdx, U32 const updateEndIdx,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
 {
@@ -12302,8 +16043,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
- U32 const mls, U32 const rowLog,
- U32 const rowMask, U32 const useCache)
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
 {
 U32 idx = ms->nextToUpdate;
 const BYTE* const base = ms->window.base;
@@ -965,13 +947,41 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_upda
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) {
 const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 const U32 rowMask = (1u << rowLog) - 1;
 const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
 
 DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
 ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
 }
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+ (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+ /* NEON path only works for little endian */
+ if (!MEM_isLittleEndian()) {
+ return 1;
+ }
+ if (rowEntries == 16) {
+ return 4;
+ }
+ if (rowEntries == 32) {
+ return 2;
+ }
+ if (rowEntries == 64) {
+ return 1;
+ }
+#endif
+ return 1;
+}
@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const
 
 /* The high-level approach of the SIMD row based match finder is as follows:
 * - Figure out where to insert the new entry:
- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index.
+ * - The hash is salted by a value that changes on every context reset, so when the same table is used
+ * we will avoid collisions that would otherwise slow us down by introducing phantom matches.
+ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
 * which row to insert into. 
- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can @@ -12521,9 +16269,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_RowFindBestMatch( - ZSTD_matchState_t* ms, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, -@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, const U32 rowLog) { U32* const hashTable = ms->hashTable; @@ -12532,7 +16282,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32* const hashCache = ms->hashCache; const U32 hashLog = ms->rowHashLog; const ZSTD_compressionParameters* const cParams = &ms->cParams; -@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( +@@ -1143,11 +1165,14 @@ size_t ZSTD_RowFindBestMatch( const U32 rowEntries = (1U << rowLog); const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ @@ -12543,7 +16293,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + U32 hash; /* DMS/DDS variables that may be referenced laster */ - const ZSTD_matchState_t* const dms = ms->dictMatchState; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; @@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( if (dictMode == ZSTD_dictMatchState) { /* Prefetch DMS rows */ @@ -12664,19 +16418,66 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (ip+currentMl == iLimit) break; } } -@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_search +@@ -1301,7 +1341,7 @@ size_t ZSTD_RowFindBestMatch( + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a +- * bunch of constants from the ZSTD_matchState_t. These computations could be ++ * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. +@@ -1325,7 +1365,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ +@@ -1335,7 +1375,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1345,7 +1385,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1446,7 +1486,7 @@ typedef enum { search_hashChain=0, searc + * If a match is found its offset is stored in @p offsetPtr. 
+ */ + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, +@@ -1472,9 +1512,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_search * Common parser - lazy strategy *********************************/ -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_lazy_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_lazy_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, -@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const searchMethod_e searchMethod, const U32 depth, +@@ -1491,12 +1532,13 @@ ZSTD_compressBlock_lazy_generic( const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); @@ -12686,6 +16487,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const int isDMS = dictMode == ZSTD_dictMatchState; const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; @@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( U32 const curr = (U32)(ip - base); U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); @@ -12721,6 +16528,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* start=ip+1; DEBUGLOG(7, "search baseline (depth 0)"); +@@ -1548,7 +1591,7 @@ ZSTD_compressBlock_lazy_generic( + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; @@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( } @@ -12749,7 +16565,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> continue; } -@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( +@@ -1579,34 +1631,34 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -12765,7 +16581,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); @@ -12790,7 +16611,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> continue; /* search a better one */ } } -@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( +@@ -1615,34 +1667,34 @@ ZSTD_compressBlock_lazy_generic( DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) @@ -12806,7 +16627,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); @@ -12871,7 +16697,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* check immediate repcode */ if (isDxS) { -@@ -1686,8 +1745,8 @@ _storeSequence: +@@ -1682,12 +1741,12 @@ _storeSequence: + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex : + base + repIndex; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; @@ -12882,7 +16713,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ip += matchLength; anchor = ip; continue; -@@ -1701,166 +1760,181 @@ _storeSequence: +@@ -1701,168 +1760,183 @@ _storeSequence: && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; @@ -12913,9 +16744,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); @@ -12923,8 +16755,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); @@ -12932,8 +16765,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t 
ZSTD_compressBlock_greedy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); @@ -12941,8 +16775,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); @@ -12950,8 +16785,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); @@ -12959,8 +16795,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); @@ -12969,9 +16806,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); @@ -12979,8 +16817,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); @@ -12989,8 +16828,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); @@ -12998,8 +16838,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); @@ -13007,8 +16848,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); @@ -13017,8 +16859,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> -/* Row-based matchfinder */ -size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); @@ -13027,9 +16870,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); @@ -13037,8 +16881,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); @@ -13046,8 +16891,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); @@ -13055,8 +16901,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t 
ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); @@ -13064,8 +16911,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); @@ -13074,7 +16922,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); @@ -13082,9 +16931,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); @@ -13092,8 +16942,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); @@ -13108,8 +16959,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_lazy_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) @@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_g DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); @@ -13136,6 +16990,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const BYTE* start=ip+1; U32 curr = (U32)(ip-base); +@@ -1912,7 +1987,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const 
repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ @@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_g } } @@ -13164,7 +17027,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> continue; } -@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g +@@ -1939,30 +2023,30 @@ size_t ZSTD_compressBlock_lazy_extDict_g ip ++; curr++; /* check repCode */ @@ -13173,7 +17036,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); @@ -13199,7 +17067,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> continue; /* search a better one */ } } -@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_g +@@ -1971,50 +2055,57 @@ size_t ZSTD_compressBlock_lazy_extDict_g ip ++; curr++; /* check repCode */ @@ -13208,7 +17076,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_g + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); @@ -13265,7 +17138,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* check immediate repcode */ while (ip <= ilimit) { -@@ -2029,8 +2120,8 @@ _storeSequence: +@@ -2023,14 +2114,14 @@ _storeSequence: + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; @@ -13276,7 +17156,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ -@@ -2045,8 +2136,9 @@ _storeSequence: +@@ -2045,58 +2136,65 @@ _storeSequence: /* Return the last literals size */ return (size_t)(iend - anchor); } @@ -13285,15 +17165,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) -@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict + { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); } -size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { @@ -13303,9 +17185,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { @@ -13314,8 +17197,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { @@ -13325,9 +17209,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) + { @@ -13336,8 +17221,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } -size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, 
SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) - { @@ -13347,9 +17233,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { @@ -13367,60 +17254,77 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -22,98 +23,175 @@ +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LAZY_H + #define ZSTD_LAZY_H + +- + #include "zstd_compress_internal.h" + + /* +@@ -22,98 +22,173 @@ */ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) - U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); - void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); - void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ +#endif -size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- --size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_dictMatchState( -+ + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState @@ -13442,38 +17346,50 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dictMatchState( +-size_t 
ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dictMatchState( -+size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + +#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy +#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row @@ -13496,37 +17412,43 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t 
srcSize); --size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_greedy_extDict( -+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + +#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row @@ -13549,17 +17471,19 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); 
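
The hunks above regroup the lazy-family entry points under ZSTD_EXCLUDE_GREEDY/LAZY/LAZY2/BTLAZY2_BLOCK_COMPRESSOR guards and pair each group with ZSTD_COMPRESSBLOCK_* aliases that expand to NULL when the strategy is compiled out. The value of the NULL fallback is that the strategy dispatch table keeps the same shape in every configuration; only the slot contents change. A minimal self-contained sketch of the pattern, with hypothetical demo_* names rather than the kernel symbols:

#include <stddef.h>
#include <stdio.h>

typedef size_t (*demo_blockCompressor_f)(const void* src, size_t srcSize);

static size_t demo_compressBlock_fast(const void* src, size_t srcSize)
{
    (void)src;
    return srcSize;            /* stand-in body */
}

#ifndef DEMO_EXCLUDE_LAZY
static size_t demo_compressBlock_lazy(const void* src, size_t srcSize)
{
    (void)src;
    return srcSize / 2;        /* stand-in body */
}
#define DEMO_COMPRESSBLOCK_LAZY demo_compressBlock_lazy
#else
#define DEMO_COMPRESSBLOCK_LAZY NULL   /* excluded: the slot stays in the table, just empty */
#endif

int main(void)
{
    /* the table keeps the same shape whichever strategies are compiled in */
    static const demo_blockCompressor_f table[] = {
        demo_compressBlock_fast,
        DEMO_COMPRESSBLOCK_LAZY,
    };
    const char data[] = "example";
    size_t i;
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i] == NULL) {
            printf("strategy %zu is compiled out\n", i);
            continue;
        }
        printf("strategy %zu -> %zu\n", i, table[i](data, sizeof data));
    }
    return 0;
}

Building with -DDEMO_EXCLUDE_LAZY drops the function body, yet the table still compiles; callers need one NULL check per slot instead of conditional compilation at every call site.
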
size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -+ + +#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict @@ -13568,8 +17492,6 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL +#endif -+ - #endif /* ZSTD_LAZY_H */ --- a/lib/zstd/compress/zstd_ldm.c @@ -13582,7 +17504,98 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the -@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZS +@@ -16,7 +17,7 @@ + #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + #include "zstd_ldm_geartab.h" + +-#define LDM_BUCKET_SIZE_LOG 3 ++#define LDM_BUCKET_SIZE_LOG 4 + #define LDM_MIN_MATCH_LENGTH 64 + #define LDM_HASH_RLOG 7 + +@@ -133,21 +134,35 @@ done: + } + + void ZSTD_ldm_adjustParameters(ldmParams_t* params, +- ZSTD_compressionParameters const* cParams) ++ const ZSTD_compressionParameters* cParams) + { + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); +- if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; +- if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (params->hashRateLog == 0) { ++ if (params->hashLog > 0) { ++ /* if params->hashLog is set, derive hashRateLog from it */ ++ assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ if (params->windowLog > params->hashLog) { ++ params->hashRateLog = params->windowLog - params->hashLog; ++ } ++ } else { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ /* mapping from [fast, rate7] to [btultra2, rate4] */ ++ params->hashRateLog = 7 - (cParams->strategy/3); ++ } ++ } + if (params->hashLog == 0) { +- params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); +- assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } +- if (params->hashRateLog == 0) { +- params->hashRateLog = params->windowLog < params->hashLog +- ? 0 +- : params->windowLog - params->hashLog; ++ if (params->minMatchLength == 0) { ++ params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (cParams->strategy >= ZSTD_btultra) ++ params->minMatchLength /= 2; ++ } ++ if (params->bucketSizeLog==0) { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); + } +@@ -170,22 +185,22 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t + /* ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ + static ldmEntry_t* ZSTD_ldm_getBucket( +- ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) ++ const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) + { +- return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); ++ return ldmState->hashTable + (hash << bucketSizeLog); + } + + /* ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ + static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, +- ldmParams_t const ldmParams) ++ U32 const bucketSizeLog) + { + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + +- *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; +- *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); ++ *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; ++ *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + + } + +@@ -234,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMat + * + * The tables for the other strategies are filled within their + * block compressors. */ +-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, ++static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) + { + const BYTE* const iend = (const BYTE*)end; +@@ -242,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZS switch(ms->cParams.strategy) { case ZSTD_fast: @@ -13600,18 +17613,102 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> break; case ZSTD_greedy: -@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZS +@@ -269,7 +288,8 @@ void ZSTD_ldm_fillHashTable( + const BYTE* iend, ldmParams_t const* params) + { + U32 const minMatchLength = params->minMatchLength; +- U32 const hBits = params->hashLog - params->bucketSizeLog; ++ U32 const bucketSizeLog = params->bucketSizeLog; ++ U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; +@@ -284,7 +304,7 @@ void ZSTD_ldm_fillHashTable( + unsigned n; + + numSplits = 0; +- hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); ++ hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { +@@ -295,7 +315,7 @@ void ZSTD_ldm_fillHashTable( + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); +- ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + +@@ -309,7 +329,7 @@ void ZSTD_ldm_fillHashTable( + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) ++static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) + { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { +@@ -318,8 +338,10 @@ static void ZSTD_ldm_limitTableUpdate(ZS } } -static size_t ZSTD_ldm_generateSequences_internal( +- ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_ldm_generateSequences_internal( - ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ++ ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, ldmParams_t const* params, void const* src, size_t srcSize) { -@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + /* LDM parameters */ +@@ -373,7 +395,7 @@ static size_t ZSTD_ldm_generateSequences + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); +- candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); ++ candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + +@@ -396,7 +418,7 @@ static size_t ZSTD_ldm_generateSequences + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -443,7 +465,7 @@ static size_t ZSTD_ldm_generateSequences + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -464,7 +486,7 @@ static size_t ZSTD_ldm_generateSequences + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + +@@ -503,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntr + } + + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldmState, rawSeqStore_t* sequences, ++ ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) + { + U32 const maxDist = 1U << params->windowLog; +@@ -549,7 +571,7 @@ size_t ZSTD_ldm_generateSequences( * the window through early invalidation. * TODO: * Test the chunk size. * * Try invalidation after the sequence generation and test the @@ -13620,7 +17717,53 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * * NOTE: Because of dictionaries + sequence splitting we MUST make sure * that any offset used is valid at the END of the sequence, since it may -@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor +@@ -580,7 +602,7 @@ size_t ZSTD_ldm_generateSequences( + } + + void +-ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) ++ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) + { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; +@@ -616,7 +638,7 @@ ZSTD_ldm_skipSequences(rawSeqStore_t* ra + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. 
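
ZSTD_ldm_getBucket() and ZSTD_ldm_insertEntry() above now take the raw bucketSizeLog instead of the whole ldmParams_t. A bucket is 2^bucketSizeLog consecutive hash-table entries starting at hash << bucketSizeLog, and bucketOffsets keeps a per-bucket cursor that wraps with a power-of-two mask, so each insert overwrites the oldest slot. A small stand-alone model of that rotation (demo_* names and sizes are made up for illustration):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t offset; uint32_t checksum; } demo_entry;

#define DEMO_HASH_LOG        4   /* 16 buckets */
#define DEMO_BUCKET_SIZE_LOG 2   /* 4 entries per bucket */

static demo_entry table[(1u << DEMO_HASH_LOG) << DEMO_BUCKET_SIZE_LOG];
static uint8_t bucketOffsets[1u << DEMO_HASH_LOG];

/* bucket = contiguous slice of 2^bucketSizeLog entries at hash << log */
static demo_entry* demo_getBucket(size_t hash)
{
    return table + (hash << DEMO_BUCKET_SIZE_LOG);
}

/* the per-bucket cursor wraps with a mask, overwriting the oldest slot */
static void demo_insertEntry(size_t hash, demo_entry e)
{
    uint8_t* const pOffset = bucketOffsets + hash;
    demo_getBucket(hash)[*pOffset] = e;
    *pOffset = (uint8_t)((*pOffset + 1) & ((1u << DEMO_BUCKET_SIZE_LOG) - 1));
}

int main(void)
{
    size_t const h = 3;
    uint32_t i;
    for (i = 0; i < 6; i++) {          /* 6 inserts into a 4-slot bucket */
        demo_entry e = { i, i * 2654435761u };
        demo_insertEntry(h, e);
    }
    /* slots 0 and 1 now hold entries 4 and 5; slots 2 and 3 still hold 2 and 3 */
    { unsigned s;
      for (s = 0; s < 4; s++)
          printf("slot %u -> offset %u\n", s, (unsigned)demo_getBucket(h)[s].offset);
    }
    return 0;
}

The cursor arithmetic relies on the bucket size being a power of two, which is what keeps the masked increment equivalent to a modulo.
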
+ */ +-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, ++static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) + { + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; +@@ -640,7 +662,7 @@ static rawSeq maybeSplitSequence(rawSeqS + return sequence; + } + +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; +@@ -657,14 +679,14 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSe + } + } + +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; +- ZSTD_blockCompressor const blockCompressor = ++ ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; +@@ -689,7 +711,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor /* maybeSplitSequence updates rawSeqStore->pos */ rawSeq const sequence = maybeSplitSequence(rawSeqStore, (U32)(iend - ip), minMatch); @@ -13628,7 +17771,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* End signal */ if (sequence.offset == 0) break; -@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor +@@ -702,6 +723,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor /* Run the block compressor */ DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); { @@ -13636,7 +17779,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t const newLitLength = blockCompressor(ms, seqStore, rep, ip, sequence.litLength); ip += sequence.litLength; -@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor +@@ -711,7 +733,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor rep[0] = sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, @@ -13655,6 +17798,60 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LDM_H + #define ZSTD_LDM_H + +- + #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ + #include <linux/zstd.h> /* ZSTD_CCtx, size_t */ + +@@ -40,7 +40,7 @@ void ZSTD_ldm_fillHashTable( + * sequences. + */ + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldms, rawSeqStore_t* sequences, ++ ldmState_t* ldms, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + + /* +@@ -61,9 +61,9 @@ size_t ZSTD_ldm_generateSequences( + * two. We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. 
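
The maybeSplitSequence() comment quoted above describes the contract: when the next long-distance sequence would run past the end of the block, it is cut at the block edge, and a returned offset of 0 means the rest of the block should be emitted as literals. A condensed sketch of the split rule (demo_* names are hypothetical, and the bookkeeping that advances the sequence store is left out):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t litLength; uint32_t matchLength; uint32_t offset; } demo_rawSeq;

static demo_rawSeq demo_maybeSplit(demo_rawSeq sequence, uint32_t remaining, uint32_t minMatch)
{
    if (remaining >= sequence.litLength + sequence.matchLength)
        return sequence;                          /* fits: emit as-is */
    if (remaining <= sequence.litLength) {
        sequence.offset = 0;                      /* only literals fit */
    } else {
        sequence.matchLength = remaining - sequence.litLength;
        if (sequence.matchLength < minMatch)
            sequence.offset = 0;                  /* truncated match too short */
    }
    return sequence;                              /* offset==0 => rest is literals */
}

int main(void)
{
    demo_rawSeq s = { 10, 64, 1000 };             /* 10 literals + 64-byte match */
    demo_rawSeq a = demo_maybeSplit(s, 30, 4);    /* block ends after 30 bytes */
    printf("lit=%u match=%u off=%u\n",
           (unsigned)a.litLength, (unsigned)a.matchLength, (unsigned)a.offset);
    /* prints lit=10 match=20 off=1000 : the match is cut at the block edge */
    return 0;
}

If the truncated match drops below minMatch it is demoted to literals as well, which is why the length check and the offset reset are separate branches.
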
+ */ +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + + /* +@@ -73,7 +73,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStor + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, ++void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + /* ZSTD_ldm_skipRawSeqStoreBytes(): +@@ -81,7 +81,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_ + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); + + /* ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is +@@ -107,5 +107,4 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t + void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +- + #endif /* ZSTD_FAST_H */ --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -1,5 +1,6 @@ @@ -13943,32 +18140,36 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - const BYTE* const ip) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) { U32* const hashTable3 = ms->hashTable3; U32 const hashLog3 = ms->hashLog3; -@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexH +@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexH * @param ip assumed <= iend-8 . 
* @param target The target of ZSTD_updateTree_internal() - we are filling to this position * @return : nb of positions added */ -static U32 ZSTD_insertBt1( +- const ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertBt1( - const ZSTD_matchState_t* ms, ++ const ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, U32 const target, -@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + U32 const mls, const int extDict) +@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1( } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR void ZSTD_updateTree_internal( - ZSTD_matchState_t* ms, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, -@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const U32 mls, const ZSTD_dictMode_e dictMode) + { const BYTE* const base = ms->window.base; U32 const target = (U32)(ip - base); U32 idx = ms->nextToUpdate; @@ -13977,7 +18178,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> idx, target, dictMode); while(idx < target) { -@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* +@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal( + ms->nextToUpdate = target; + } + +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); } FORCE_INLINE_TEMPLATE @@ -13994,7 +18201,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +U32 +ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ -+ ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, + const ZSTD_dictMode_e dictMode, @@ -14005,6 +18212,31 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + +- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; ++ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; +@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ +- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ +- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } @@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", repCode, ll0, repOffset, repLen); @@ -14056,7 +18288,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) -@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( +@@ -784,7 +820,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, +- ZSTD_matchState_t*, ++ ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, +@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)( U32 const ll0, U32 const lengthToBeat); @@ -14065,10 +18306,63 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_btGetAllMatches_internal( ZSTD_match_t* matches, - ZSTD_matchState_t* ms, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, U32* nextToUpdate3, -@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZS - const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + const BYTE* ip, + const BYTE* const iHighLimit, +@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllM + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ +@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchSta + } + + static ZSTD_getAllMatchesFn +-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) ++ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) + { + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchSta + + /* Struct containing info needed to make decision about ldm inclusion */ + typedef struct { +- rawSeqStore_t seqStore; /* External match candidates store for this block */ ++ RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ 
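
Several hunks in this patch replace the open-coded repcode guard ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ with ZSTD_index_overlap_check(dictLimit, repIndex). The test rejects exactly those indices where a 4-byte read starting at repIndex would straddle the extDict/prefix boundary at dictLimit; the unsigned wraparound makes indices at or above dictLimit pass. A quick self-check, assuming the macro keeps that classic definition (demo scaffolding only, not the kernel header):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t U32;

/* assumed definition: a 4-byte read at repIndex must not cross dictLimit */
#define ZSTD_index_overlap_check(prefixLowestIndex, repIndex) \
    ((U32)((prefixLowestIndex) - 1 - (repIndex)) >= 3)

int main(void)
{
    U32 const dictLimit = 100;
    U32 repIndex;
    for (repIndex = 90; repIndex < 110; repIndex++) {
        /* explicit form: reject only windows [repIndex, repIndex+3] crossing dictLimit */
        int const crosses = (repIndex < dictLimit) && (repIndex + 4 > dictLimit);
        assert(ZSTD_index_overlap_check(dictLimit, repIndex) == !crosses);
    }
    puts("wraparound check matches the explicit boundary test");
    return 0;
}
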
+@@ -878,7 +916,7 @@ typedef struct { + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) ++static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) + { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { +@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(Z + return; + } + +- /* Matches may be < MINMATCH by this process. In that case, we will reject them ++ /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; +@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(Z + * into 'matches'. Maintains the correct ordering of 'matches'. + */ + static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, +- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) ++ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, ++ U32 minMatch) { U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; - /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ @@ -14076,7 +18370,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; /* Ensure that current block position is not outside of the match */ -@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock +- || candidateMatchLength < MINMATCH) { ++ || candidateMatchLength < minMatch) { + return; } if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { @@ -14092,7 +18390,26 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> (*nbMatches)++; } } -@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_o +@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZS + static void + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, +- U32 currPosInBlock, U32 remainingBytes) ++ U32 currPosInBlock, U32 remainingBytes, ++ U32 minMatch) + { + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; +@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_o + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } +- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); ++ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); + } + + +@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_o * Optimal parser *********************************/ @@ -14104,11 +18421,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #if 0 /* debug */ static void -@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltI +@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltI #endif -FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) +#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) +#define LL_INCPRICE(_l) (LL_PRICE(_l) - 
LL_PRICE(_l-1)) @@ -14116,10 +18435,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t - ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - seqStore_t* seqStore, ++ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], -@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + const void* src, size_t srcSize, + const int optLevel, +@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc ZSTD_optimal_t* const opt = optStatePtr->priceTable; ZSTD_match_t* const matches = optStatePtr->matchTable; @@ -14132,13 +18453,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); -@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc +@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc U32 const ll0 = !litlen; U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, - (U32)(ip-istart), (U32)(iend - ip)); - if (!nbMatches) { ip++; continue; } -+ (U32)(ip-istart), (U32)(iend-ip)); ++ (U32)(ip-istart), (U32)(iend-ip), ++ minMatch); + if (!nbMatches) { + DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); + ip++; @@ -14240,7 +18562,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - assert(cur < ZSTD_OPT_NUM); - DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) + assert(cur <= ZSTD_OPT_NUM); -+ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); ++ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); /* Fix current position with one literal if cheaper */ - { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; @@ -14253,9 +18575,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + + LL_INCPRICE(litlen); assert(price < 1000000000); /* overflow check */ if (price <= opt[cur].price) { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + ZSTD_optimal_t const prevMatch = opt[cur]; - DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, ++ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); - opt[cur].mlen = 0; - opt[cur].off = 0; @@ -14276,13 +18600,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + && (with1literal < opt[cur+1].price) ) { + /* update offset history - before it disappears */ + U32 const prev = cur - prevMatch.mlen; -+ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); + assert(cur >= prevMatch.mlen); + DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", + ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), + newReps.rep[0], newReps.rep[1], newReps.rep[2] ); + opt[cur+1] = prevMatch; /* mlen & offbase */ -+ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); + opt[cur+1].litlen = 1; + opt[cur+1].price = with1literal; + if (last_pos < cur+1) last_pos = cur+1; @@ -14292,8 +18616,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), - opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); -+ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", -+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); ++ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); } } @@ -14304,21 +18628,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + /* Offset history is not updated during match comparison. + * Do it here, now that the match is selected and confirmed. 
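
The comment added above, "Offset history is not updated during match comparison. Do it here, now that the match is selected and confirmed", is the heart of the repcode bookkeeping: candidate stretches are priced against a stale rep[] array, and only the winning stretch rotates the history via ZSTD_newRep(). A toy model of the 3-slot rotation (hypothetical demo_* names; the real ZSTD_newRep() also covers the repcode-encoded offsets and the ll0 special case):

#include <stdint.h>
#include <stdio.h>

enum { DEMO_REP_NUM = 3 };

typedef struct { uint32_t rep[DEMO_REP_NUM]; } demo_reps;

/* a brand-new offset pushes the two most recent ones back */
static demo_reps demo_newRep(demo_reps h, uint32_t newOffset)
{
    demo_reps r;
    r.rep[2] = h.rep[1];
    r.rep[1] = h.rep[0];
    r.rep[0] = newOffset;
    return r;
}

/* re-using history slot i moves it to the front, keeping the others in order */
static demo_reps demo_useRep(demo_reps h, unsigned i)
{
    demo_reps r = h;
    uint32_t const off = h.rep[i];
    unsigned k;
    for (k = i; k > 0; k--) r.rep[k] = h.rep[k-1];
    r.rep[0] = off;
    return r;
}

int main(void)
{
    demo_reps h = { { 1, 4, 8 } };
    h = demo_newRep(h, 100);  /* -> {100, 1, 4} */
    h = demo_useRep(h, 2);    /* -> {4, 100, 1} */
    printf("%u %u %u\n", (unsigned)h.rep[0], (unsigned)h.rep[1], (unsigned)h.rep[2]);
    return 0;
}

Deferring the rotation until a stretch is confirmed is what lets the parser compare many candidates cheaply without cloning the history per candidate.
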
*/ - ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); +- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); ++ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); assert(cur >= opt[cur].mlen); - if (opt[cur].mlen != 0) { + if (opt[cur].litlen == 0) { + /* just finished a match => alter offset history */ U32 const prev = cur - opt[cur].mlen; - repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); -+ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); - ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); - } else { - ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ++ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); } /* last match must start at a minimum distance of 8 from oend */ -@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc +@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc if ( (optLevel==0) /*static_test*/ && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { @@ -14338,7 +18664,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; -@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(inr-istart), (U32)(iend-inr)); ++ (U32)(inr-istart), (U32)(iend-inr), ++ minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); continue; } @@ -14355,8 +18687,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - last_pos = cur + ZSTD_totalLen(lastSequence); - if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + { U32 const longestML = matches[nbMatches-1].len; -+ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", -+ inr-istart, cur, nbMatches, longestML); ++ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", ++ (int)(inr-istart), cur, nbMatches, longestML); + + if ( (longestML > sufficient_len) + || (cur + longestML >= ZSTD_OPT_NUM) @@ -14368,7 +18700,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> goto _shortestPath; } } -@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc +@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; U32 mlen; @@ -14399,7 +18731,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> opt[pos].price = price; } else { DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", -@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc +@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matc if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ } } } } @@ -14436,11 +18768,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + /* Update offset history */ + if (lastStretch.litlen == 0) { + /* finishing on a match : update offset history */ -+ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); -+ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); ++ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); } else { - ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); -+ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); + assert(cur >= lastStretch.litlen); + cur -= lastStretch.litlen; } @@ -14509,9 +18841,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - U32 const offCode = opt[storePos].off; + U32 const offBase = opt[storePos].off; U32 const advance = llen + mlen; - DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", - anchor - istart, (unsigned)llen, (unsigned)mlen); -@@ -1308,11 +1422,14 @@ _shortestPath: /* cur, last_pos, best_ +- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", +- anchor - istart, (unsigned)llen, (unsigned)mlen); ++ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", ++ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ +@@ -1308,11 +1426,14 @@ _shortestPath: /* cur, last_pos, best_ } assert(anchor + llen <= iend); @@ -14528,7 +18865,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_setBasePrices(optStatePtr, optLevel); } } /* while (ip < ilimit) */ -@@ -1320,21 +1437,27 @@ _shortestPath: /* cur, last_pos, best_ +@@ -1320,42 +1441,51 @@ _shortestPath: /* cur, last_pos, best_ /* Return the last literals size */ return (size_t)(iend - anchor); } @@ -14536,7 +18873,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR static size_t ZSTD_compressBlock_opt0( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) { return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); @@ -14545,7 +18883,8 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR static size_t ZSTD_compressBlock_opt2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) { return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); 
@@ -14554,9 +18893,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) -@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } @@ -14579,14 +18919,14 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> - const void* src, size_t srcSize) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, -+ seqStore_t* seqStore, ++void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) { U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); -@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* +@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ @@ -14595,7 +18935,23 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_resetSeqStore(seqStore); ms->window.base -= srcSize; ms->window.dictLimit += (U32)srcSize; -@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( +@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* + } + + size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra( + } + + size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { U32 const curr = (U32)((const BYTE*)src - ms->window.base); DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); @@ -14609,7 +18965,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Consequently, this can only work if no data has been previously loaded in tables, * aka, no dictionary, no prefix, no ldm preprocessing. 
* The compression ratio gain is generally small (~0.5% on first block), -@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( +@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2( if ( (ms->opt.litLengthSum==0) /* first block */ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ @@ -14627,15 +18983,17 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) -@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatc + { return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); @@ -14644,9 +19002,10 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); @@ -14654,7 +19013,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } size_t ZSTD_compressBlock_btultra_extDict( -@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDic +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) { return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } @@ -14672,28 +19033,35 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the -@@ -14,30 +15,40 @@ +@@ -11,40 +12,62 @@ + #ifndef ZSTD_OPT_H + #define ZSTD_OPT_H +- #include "zstd_compress_internal.h" +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) /* used in ZSTD_loadDictionaryContent() */ - void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt @@ -14706,28 +19074,31 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#endif -size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], -- void const* src, size_t srcSize); - size_t ZSTD_compressBlock_btultra_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDic + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + /* note : no btultra2 variant for extDict nor dictMatchState, * because btultra2 is not meant to work with dictionaries * and is only specific for the first block (no prefix) */ +size_t ZSTD_compressBlock_btultra2( -+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); -+ + +#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra 
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict @@ -14739,8 +19110,286 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL +#endif - #endif /* ZSTD_OPT_H */ +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.c +@@ -0,0 +1,239 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#include "../common/compiler.h" /* ZSTD_ALIGNOF */ ++#include "../common/mem.h" /* S64 */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/zstd_internal.h" /* ZSTD_STATIC_ASSERT */ ++#include "hist.h" /* HIST_add */ ++#include "zstd_preSplit.h" ++ ++ ++#define BLOCKSIZE_MIN 3500 ++#define THRESHOLD_PENALTY_RATE 16 ++#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) ++#define THRESHOLD_PENALTY 3 ++ ++#define HASHLENGTH 2 ++#define HASHLOG_MAX 10 ++#define HASHTABLESIZE (1 << HASHLOG_MAX) ++#define HASHMASK (HASHTABLESIZE - 1) ++#define KNUTH 0x9e3779b9 ++ ++/* for hashLog > 8, hash 2 bytes. ++ * for hashLog == 8, just take the byte, no hashing. ++ * The speed of this method relies on compile-time constant propagation */ ++FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) ++{ ++ assert(hashLog >= 8); ++ if (hashLog == 8) return (U32)((const BYTE*)p)[0]; ++ assert(hashLog <= HASHLOG_MAX); ++ return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); ++} ++ ++ ++typedef struct { ++ unsigned events[HASHTABLESIZE]; ++ size_t nbEvents; ++} Fingerprint; ++typedef struct { ++ Fingerprint pastEvents; ++ Fingerprint newEvents; ++} FPStats; ++ ++static void initStats(FPStats* fpstats) ++{ ++ ZSTD_memset(fpstats, 0, sizeof(FPStats)); ++} ++ ++FORCE_INLINE_TEMPLATE void ++addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ const char* p = (const char*)src; ++ size_t limit = srcSize - HASHLENGTH + 1; ++ size_t n; ++ assert(srcSize >= HASHLENGTH); ++ for (n = 0; n < limit; n+=samplingRate) { ++ fp->events[hash2(p+n, hashLog)]++; ++ } ++ fp->nbEvents += limit/samplingRate; ++} ++ ++FORCE_INLINE_TEMPLATE void ++recordFingerprint_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); ++ fp->nbEvents = 0; ++ addEvents_generic(fp, src, srcSize, samplingRate, hashLog); ++} ++ ++typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); ++ ++#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate ++ ++#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ ++ static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ ++ { \ ++ recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ ++ } ++ ++ZSTD_GEN_RECORD_FINGERPRINT(1, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(5, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(11, 9) ++ZSTD_GEN_RECORD_FINGERPRINT(43, 8) ++ ++ ++static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? 
-s64 : s64); } ++ ++static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) ++{ ++ U64 distance = 0; ++ size_t n; ++ assert(hashLog <= HASHLOG_MAX); ++ for (n = 0; n < ((size_t)1 << hashLog); n++) { ++ distance += ++ abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); ++ } ++ return distance; ++} ++ ++/* Compare newEvents with pastEvents ++ * return 1 when considered "too different" ++ */ ++static int compareFingerprints(const Fingerprint* ref, ++ const Fingerprint* newfp, ++ int penalty, ++ unsigned hashLog) ++{ ++ assert(ref->nbEvents > 0); ++ assert(newfp->nbEvents > 0); ++ { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; ++ U64 deviation = fpDistance(ref, newfp, hashLog); ++ U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; ++ return deviation >= threshold; ++ } ++} ++ ++static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ acc->events[n] += newfp->events[n]; ++ } ++ acc->nbEvents += newfp->nbEvents; ++} ++ ++static void flushEvents(FPStats* fpstats) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; ++ } ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; ++ ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); ++} ++ ++static void removeEvents(Fingerprint* acc, const Fingerprint* slice) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ assert(acc->events[n] >= slice->events[n]); ++ acc->events[n] -= slice->events[n]; ++ } ++ acc->nbEvents -= slice->nbEvents; ++} ++ ++#define CHUNKSIZE (8 << 10) ++static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ static const RecordEvents_f records_fs[] = { ++ FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) ++ }; ++ static const unsigned hashParams[] = { 8, 9, 10, 10 }; ++ const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); ++ FPStats* const fpstats = (FPStats*)workspace; ++ const char* p = (const char*)blockStart; ++ int penalty = THRESHOLD_PENALTY; ++ size_t pos = 0; ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ record_f(&fpstats->pastEvents, p, CHUNKSIZE); ++ for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { ++ record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); ++ if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { ++ return pos; ++ } else { ++ mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); ++ if (penalty > 0) penalty--; ++ } ++ } ++ assert(pos == blockSize); ++ return blockSize; ++ (void)flushEvents; (void)removeEvents; ++} ++ ++/* ZSTD_splitBlock_fromBorders(): very fast strategy : ++ * compare fingerprint from beginning and end of the block, ++ * derive from their difference if it's preferable to split in the middle, ++ * repeat the process a second time, for finer grained decision. ++ * 3 times did not brought improvements, so I stopped at 2. ++ * Benefits are good enough for a cheap heuristic. ++ * More accurate splitting saves more, but speed impact is also more perceptible. 
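Concretely, fpDistance() above is a cross-normalized L1 distance between two event histograms, and compareFingerprints() requests a split once that distance reaches (THRESHOLD_BASE + penalty)/THRESHOLD_PENALTY_RATE of the normalization product nbEvents1 * nbEvents2. A toy 4-bucket example with illustrative values (an editor's sketch, not code from this patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Toy version of fpDistance() + compareFingerprints(). With
     * THRESHOLD_BASE = 14 and THRESHOLD_PENALTY_RATE = 16, a penalty of 0
     * means: split once the cross-normalized L1 deviation reaches 14/16
     * of nbEvents1 * nbEvents2. */
    int main(void)
    {
        uint64_t const past[4] = { 10, 0, 0, 0 };   /* nbEvents1 = 10 */
        uint64_t const next[4] = { 0, 10, 0, 0 };   /* nbEvents2 = 10 */
        uint64_t const n1 = 10, n2 = 10;
        uint64_t const threshold = n1 * n2 * 14 / 16;   /* = 87 */
        uint64_t dist = 0;
        int i;
        for (i = 0; i < 4; i++) {
            int64_t const d = (int64_t)(past[i] * n2) - (int64_t)(next[i] * n1);
            dist += (uint64_t)((d < 0) ? -d : d);   /* same idea as abs64() above */
        }
        /* dist = |100-0| + |0-100| = 200 >= 87 : "too different", so split */
        printf("dist=%llu threshold=%llu split=%d\n",
               (unsigned long long)dist, (unsigned long long)threshold,
               dist >= threshold);
        return 0;
    }

The decaying penalty in ZSTD_splitBlock_byChunks() makes the first few chunk comparisons stricter, so a split is only taken early when the statistics diverge strongly.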
++ * For better accuracy, use more elaborate variant *_byChunks. ++ */ ++static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, ++ void* workspace, size_t wkspSize) ++{ ++#define SEGMENT_SIZE 512 ++ FPStats* const fpstats = (FPStats*)workspace; ++ Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); ++ HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; ++ if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) ++ return blockSize; ++ ++ HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); ++ middleEvents->nbEvents = SEGMENT_SIZE; ++ { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); ++ U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); ++ U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; ++ if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) ++ return 64 KB; ++ return (distFromBegin > distFromEnd) ? 32 KB : 96 KB; ++ } ++} ++ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); ++ assert(0<=level && level<=4); ++ if (level == 0) ++ return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); ++ /* level >= 1*/ ++ return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); ++} +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.h +@@ -0,0 +1,34 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#ifndef ZSTD_PRESPLIT_H ++#define ZSTD_PRESPLIT_H ++ ++#include <linux/types.h> /* size_t */ ++ ++#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 ++ ++/* ZSTD_splitBlock(): ++ * @level must be a value between 0 and 4. ++ * higher levels spend more energy to detect block boundaries. ++ * @workspace must be aligned for size_t. ++ * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE ++ * note: ++ * For the time being, this function only accepts full 128 KB blocks. ++ * Therefore, @blockSize must be == 128 KB. ++ * While this could be extended to smaller sizes in the future, ++ * it is not yet clear if this would be useful. TBD. 
++ */ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize); ++ ++#endif /* ZSTD_PRESPLIT_H */ --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -16249,8 +20898,9 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +** or an error code, which can be tested using ZSTD_isError() */ - size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) ++size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) { const BYTE* ip = (const BYTE*)src; size_t const minInputSize = ZSTD_startingInputLength(format); @@ -16289,7 +20939,37 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { -@@ -540,49 +570,52 @@ static size_t readSkippableFrameSize(voi +@@ -438,8 +468,10 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); +- zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; ++ zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; ++ zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; ++ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); +@@ -508,7 +540,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) ++size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) + { + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); + } +@@ -520,7 +552,7 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHea + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ + unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) + { +- { ZSTD_frameHeader zfh; ++ { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { +@@ -540,49 +572,52 @@ static size_t readSkippableFrameSize(voi sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, frameParameter_unsupported, ""); @@ -16365,7 +21045,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) { unsigned long long totalDstSize = 0; -@@ -592,9 +625,7 @@ unsigned long long ZSTD_findDecompressed +@@ -592,9 +627,7 @@ unsigned long long ZSTD_findDecompressed if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { size_t const skippableSize = readSkippableFrameSize(src, srcSize); @@ -16376,7 +21056,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(skippableSize <= srcSize); src = (const BYTE *)src + skippableSize; -@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressed +@@ -602,17 +635,17 @@ unsigned long long ZSTD_findDecompressed continue; } @@ -16402,7 +21082,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> src = (const BYTE *)src + frameSrcSize; srcSize -= frameSrcSize; -@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFram +@@ -676,13 +709,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFram return frameSizeInfo; } @@ -16418,8 +21098,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); assert(ZSTD_isError(frameSizeInfo.compressedSize) || -@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrame - ZSTD_frameHeader zfh; +@@ -693,10 +726,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrame + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; /* Extract Frame Header */ - { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); @@ -16427,7 +21111,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (ZSTD_isError(ret)) return ZSTD_errorFrameSizeInfo(ret); if (ret > 0) -@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrame +@@ -730,28 +763,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrame ip += 4; } @@ -16461,7 +21145,13 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* ZSTD_decompressBound() : -@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound( + * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame or a skippeable frame ++ * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +@@ -760,7 +796,7 @@ unsigned long long ZSTD_decompressBound( unsigned long long bound = 0; /* Iterate over each frame */ while (srcSize > 0) { @@ -16470,7 +21160,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t const compressedSize = frameSizeInfo.compressedSize; unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) -@@ -773,6 +807,48 @@ 
unsigned long long ZSTD_decompressBound( +@@ -773,6 +809,48 @@ unsigned long long ZSTD_decompressBound( return bound; } @@ -16484,7 +21174,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; -+ ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + + FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) @@ -16519,7 +21209,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /*-************************************************************* * Frame decoding -@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_ +@@ -815,7 +893,7 @@ static size_t ZSTD_setRleBlock(void* dst + return regenSize; + } + +-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) ++static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) + { + (void)dctx; + (void)uncompressedSize; +@@ -856,6 +934,10 @@ static size_t ZSTD_decompressFrame(ZSTD_ ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; } @@ -16530,7 +21229,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Loop on each block */ while (1) { BYTE* oBlockEnd = oend; -@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_ +@@ -888,7 +970,8 @@ static size_t ZSTD_decompressFrame(ZSTD_ switch(blockProperties.blockType) { case bt_compressed: @@ -16540,7 +21239,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> break; case bt_raw : /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ -@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_ +@@ -901,12 +984,14 @@ static size_t ZSTD_decompressFrame(ZSTD_ default: RETURN_ERROR(corruption_detected, "invalid block type"); } @@ -16559,11 +21258,11 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(ip != NULL); ip += cBlockSize; remainingSrcSize -= cBlockSize; -@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_ +@@ -930,12 +1015,15 @@ static size_t ZSTD_decompressFrame(ZSTD_ } ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ -+ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); *srcPtr = ip; *srcSizePtr = remainingSrcSize; return (size_t)(op-ostart); @@ -16576,7 +21275,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, -@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame( +@@ -955,17 +1043,18 @@ static size_t ZSTD_decompressMultiFrame( while (srcSize >= ZSTD_startingInputLength(dctx->format)) { @@ -16600,7 +21299,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } if (ddict) { -@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t +@@ -1061,8 +1150,8 @@ size_t ZSTD_decompress(void* dst, size_t size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } /* @@ -16611,7 +21310,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * be streamed. * * For blocks that can be streamed, this allows us to reduce the latency until we produce -@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx +@@ -1181,7 +1270,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); @@ -16621,7 +21320,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> dctx->expected = 0; /* Streaming not supported */ break; case bt_raw : -@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx +@@ -1250,6 +1340,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx case ZSTDds_decodeSkippableHeader: assert(src != NULL); assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); @@ -16629,7 +21328,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ dctx->stage = ZSTDds_skipFrame; -@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx +@@ -1262,7 +1353,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx default: assert(0); /* impossible */ @@ -16638,7 +21337,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } -@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* +@@ -1303,11 +1394,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* /* in minimal huffman, we always use X1 variants */ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, @@ -16652,7 +21351,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #endif RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); dictPtr += hSize; -@@ -1403,10 
+1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* d +@@ -1403,10 +1494,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* d dctx->prefixStart = NULL; dctx->virtualStart = NULL; dctx->dictEnd = NULL; @@ -16665,7 +21364,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ dctx->LLTptr = dctx->entropy.LLTable; -@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const v +@@ -1465,7 +1557,7 @@ unsigned ZSTD_getDictID_fromDict(const v * This could for one of the following reasons : * - The frame does not require a dictionary (most common case). * - The frame was built with dictID intentionally removed. @@ -16674,16 +21373,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, frame header could not be decoded. * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. -@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const v +@@ -1474,7 +1566,7 @@ unsigned ZSTD_getDictID_fromDict(const v * ZSTD_getFrameHeader(), which will provide a more precise error code. */ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { - ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; -+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; ++ ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; -@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_D +@@ -1581,7 +1673,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_D size_t ZSTD_initDStream(ZSTD_DStream* zds) { DEBUGLOG(4, "ZSTD_initDStream"); @@ -16694,7 +21393,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } /* ZSTD_initDStream_usingDDict() : -@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zd +@@ -1589,6 +1683,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zd * this function cannot fail */ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) { @@ -16702,7 +21401,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); return ZSTD_startingInputLength(dctx->format); -@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_ +@@ -1599,6 +1694,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_ * this function cannot fail */ size_t ZSTD_resetDStream(ZSTD_DStream* dctx) { @@ -16710,7 +21409,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); return ZSTD_startingInputLength(dctx->format); } -@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_d +@@ -1670,6 +1766,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_d bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; return bounds; @@ -16726,7 +21425,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> default:; } bounds.error = ERROR(parameter_unsupported); -@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* +@@ -1710,6 +1815,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* case ZSTD_d_refMultipleDDicts: *value = (int)dctx->refMultipleDDicts; return 0; @@ -16739,7 +21438,7 @@ Signed-off-by: 
Oleksandr Natalenko <oleksandr@natalenko.name> default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* +@@ -1743,6 +1854,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* } dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; return 0; @@ -16754,7 +21453,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> default:; } RETURN_ERROR(parameter_unsupported, ""); -@@ -1754,6 +1871,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, +@@ -1754,6 +1873,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, || (reset == ZSTD_reset_session_and_parameters) ) { dctx->streamStage = zdss_init; dctx->noForwardProgress = 0; @@ -16762,7 +21461,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { -@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DS +@@ -1770,11 +1890,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DS return ZSTD_sizeof_DCtx(dctx); } @@ -16784,7 +21483,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); size_t const minRBSize = (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, -@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsig +@@ -1782,6 +1908,11 @@ size_t ZSTD_decodingBufferSize_min(unsig return minRBSize; } @@ -16796,7 +21495,24 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> size_t ZSTD_estimateDStreamSize(size_t windowSize) { size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); -@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -1793,7 +1924,7 @@ size_t ZSTD_estimateDStreamSize(size_t w + size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) + { + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); +@@ -1888,6 +2019,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); ++ assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, +@@ -1918,7 +2050,6 @@ size_t ZSTD_decompressStream(ZSTD_DStrea if (zds->refMultipleDDicts && zds->ddictSet) { ZSTD_DCtx_selectFrameDDict(zds); } @@ -16804,7 +21520,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (ZSTD_isError(hSize)) { return hSize; /* error */ } -@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -1932,6 +2063,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea zds->lhSize += remainingInput; } input->pos = input->size; @@ -16816,7 +21532,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } assert(ip != NULL); -@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -1943,14 +2079,15 @@ size_t ZSTD_decompressStream(ZSTD_DStrea if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && zds->fParams.frameType != ZSTD_skippableFrame && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { @@ -16835,7 +21551,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zds->expected = 0; 
zds->streamStage = zdss_init; someMoreWork = 0; -@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -1969,7 +2106,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea DEBUGLOG(4, "Consume header"); FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); @@ -16845,7 +21561,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); zds->stage = ZSTDds_skipFrame; } else { -@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -1985,11 +2123,13 @@ size_t ZSTD_decompressStream(ZSTD_DStrea zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, frameParameter_windowTooLarge, ""); @@ -16860,7 +21576,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> : 0; ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); -@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2034,6 +2174,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); @@ -16868,7 +21584,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> ip += neededInSize; /* Function modifies the stage so we must break */ break; -@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2048,7 +2189,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; /* At this point we shouldn't be decompressing a block that we can stream. */ @@ -16877,7 +21593,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (isSkipFrame) { loadedSize = MIN(toLoad, (size_t)(iend-ip)); } else { -@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2057,8 +2198,11 @@ size_t ZSTD_decompressStream(ZSTD_DStrea "should never happen"); loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); } @@ -16891,7 +21607,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ -@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2068,14 +2212,17 @@ size_t ZSTD_decompressStream(ZSTD_DStrea break; } case zdss_flush: @@ -16912,7 +21628,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", (int)(zds->outBuffSize - zds->outStart), (U32)zds->fParams.blockSizeMax); -@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2089,7 +2236,7 @@ size_t ZSTD_decompressStream(ZSTD_DStrea default: assert(0); /* impossible */ @@ -16921,7 +21637,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } } /* result */ -@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea +@@ -2102,8 +2249,8 @@ size_t ZSTD_decompressStream(ZSTD_DStrea if ((ip==istart) && (op==ostart)) { /* no forward progress */ zds->noForwardProgress ++; if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { @@ -16932,7 +21648,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> assert(0); } } else { -@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs +@@ -2140,11 +2287,17 @@ size_t ZSTD_decompressStream_simpleArgs void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t 
srcSize, size_t* srcPos) { @@ -17076,10 +21792,12 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ void* dst, size_t dstCapacity, const streaming_operation streaming) { -@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt +@@ -124,7 +140,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); { const BYTE* const istart = (const BYTE*) src; - symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); +- symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); switch(litEncType) @@ -17240,6 +21958,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* Default FSE distribution tables. * These are pre-calculated FSE decoding tables using default distributions as defined in specification : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -317,7 +359,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCt + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables +- * - pretify output, report below, test with fuzzer to ensure it's correct */ ++ * - prettify output, report below, test with fuzzer to ensure it's correct */ + + /* Default FSE distribution table for Literal Lengths */ + static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = { @@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSym for (i = 8; i < n; i += 8) { MEM_write64(spread + pos + i, sv); @@ -17277,6 +22004,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; +@@ -603,7 +646,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* + * @return : nb bytes read from src, + * or an error code if it fails */ + static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, +- symbolEncodingType_e type, unsigned max, U32 maxLog, ++ SymbolEncodingType_e type, unsigned max, U32 maxLog, + const void* src, size_t srcSize, + const U32* baseValue, const U8* nbAdditionalBits, + const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, @@ -664,11 +707,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* /* SeqHead */ @@ -17289,7 +22025,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); -@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* +@@ -681,11 +719,19 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* } *nbSeqPtr = nbSeq; @@ -17302,10 +22038,16 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> + /* FSE table descriptors */ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ +- { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); +- symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); +- symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); + RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. 
*/ - { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); - symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); - symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ++ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); ++ SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); ++ SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ @@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, cons /* ZSTD_safecopyDstBeforeSrc(): * This version allows overlap with dst before src, or handles the non-overlap case with dst after src @@ -18055,7 +22797,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ -@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t +@@ -1873,50 +1928,40 @@ static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -18068,14 +22810,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ -@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequence - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, + #endif /* DYNAMIC_BMI2 */ + +-typedef size_t (*ZSTD_decompressSequences_t)( +- ZSTD_DCtx* dctx, +- void* dst, size_t maxDstSize, +- const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset, - const int frame); -+ const ZSTD_longOffset_e isLongOffset); - +- #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -18113,7 +22856,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ -@@ -1931,69 +1982,114 @@ static size_t +@@ -1931,69 +1976,114 @@ static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -18143,15 +22886,15 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> +{ + return (size_t)(op - virtualStart); +} -+ -+typedef struct { -+ unsigned longOffsetShare; -+ unsigned maxNbAdditionalBits; -+} ZSTD_OffsetInfo; -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) -/* ZSTD_getLongOffsetsShare() : ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; ++ +/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) @@ -18264,7 +23007,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; -@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* +@@ -2001,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* /* Build Decoding Tables */ { @@ -18288,7 +23031,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. 
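For reference, the Sequences-section descriptor byte handled in the ZSTD_decodeSeqHeaders() hunk earlier packs two bits per table (Literal-Lengths, Offsets, Match-Lengths) plus a Reserved field that the updated code now rejects when non-zero. A standalone sketch of that decode, with an illustrative byte value (per the zstd format, mode 0 = Predefined, 1 = RLE, 2 = FSE_Compressed, 3 = Repeat); this is an editor's illustration, not code from this patch:

    #include <assert.h>
    #include <stdio.h>

    /* Field layout mirrors ZSTD_decodeSeqHeaders():
     * bits 7-6 = Literal-Lengths mode, bits 5-4 = Offsets mode,
     * bits 3-2 = Match-Lengths mode, bits 1-0 = Reserved (must be zero). */
    int main(void)
    {
        unsigned char const descriptor = 0xA8;          /* 10 10 10 00 */
        unsigned const LLtype = descriptor >> 6;        /* 2 -> FSE_Compressed */
        unsigned const OFtype = (descriptor >> 4) & 3;  /* 2 -> FSE_Compressed */
        unsigned const MLtype = (descriptor >> 2) & 3;  /* 2 -> FSE_Compressed */
        assert((descriptor & 3) == 0);  /* the patch treats non-zero as corruption */
        printf("LL=%u OF=%u ML=%u\n", LLtype, OFtype, MLtype);
        return 0;
    }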
-@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* +@@ -2008,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; @@ -18300,7 +23043,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); -@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* +@@ -2015,40 +2127,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* ip += seqHSize; srcSize -= seqHSize; @@ -18371,7 +23114,7 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) { if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ -@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dct +@@ -2060,13 +2187,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dct } @@ -18455,14 +23198,25 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> U32 rep[ZSTD_REP_NUM]; U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; } ZSTD_entropyDTables_t; -@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s +@@ -135,7 +137,7 @@ struct ZSTD_DCtx_s + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; +- ZSTD_frameHeader fParams; ++ ZSTD_FrameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ +@@ -152,7 +154,8 @@ struct ZSTD_DCtx_s size_t litSize; size_t rleSize; size_t staticSize; +-#if DYNAMIC_BMI2 != 0 + int isFrameDecompression; - #if DYNAMIC_BMI2 != 0 ++#if DYNAMIC_BMI2 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ #endif + @@ -164,6 +167,8 @@ struct ZSTD_DCtx_s ZSTD_dictUses_e dictUses; ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ @@ -18472,6 +23226,21 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> /* streaming */ ZSTD_dStreamStage streamStage; +@@ -199,11 +204,11 @@ struct ZSTD_DCtx_s + }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +-#if DYNAMIC_BMI2 != 0 +- return dctx->bmi2; ++#if DYNAMIC_BMI2 ++ return dctx->bmi2; + #else + (void)dctx; +- return 0; ++ return 0; + #endif + } + --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -1,6 +1,6 @@ @@ -18512,6 +23281,106 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the +@@ -16,6 +16,7 @@ + + #include "common/zstd_deps.h" + #include "common/zstd_internal.h" ++#include "compress/zstd_compress_internal.h" + + #define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ +@@ -85,6 +86,12 @@ zstd_parameters zstd_get_params(int leve + } + EXPORT_SYMBOL(zstd_get_params); + ++size_t zstd_cctx_set_param(zstd_cctx *cctx, ZSTD_cParameter param, int value) ++{ ++ return ZSTD_CCtx_setParameter(cctx, param, value); ++} ++EXPORT_SYMBOL(zstd_cctx_set_param); ++ + zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size) + { +@@ -98,6 +105,52 @@ size_t zstd_cctx_workspace_bound(const z + } + EXPORT_SYMBOL(zstd_cctx_workspace_bound); + ++// Used by zstd_cctx_workspace_bound_with_ext_seq_prod() ++static size_t dummy_external_sequence_producer( ++ void *sequenceProducerState, ++ ZSTD_Sequence *outSeqs, size_t outSeqsCapacity, ++ const void *src, size_t srcSize, ++ const void *dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize) ++{ ++ (void)sequenceProducerState; ++ (void)outSeqs; (void)outSeqsCapacity; ++ (void)src; (void)srcSize; ++ (void)dict; (void)dictSize; ++ (void)compressionLevel; ++ (void)windowSize; ++ return ZSTD_SEQUENCE_PRODUCER_ERROR; ++} ++ ++static void init_cctx_params_from_compress_params( ++ ZSTD_CCtx_params *cctx_params, ++ const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_parameters zstd_params; ++ memset(&zstd_params, 0, sizeof(zstd_params)); ++ zstd_params.cParams = *compress_params; ++ ZSTD_CCtxParams_init_advanced(cctx_params, zstd_params); ++} ++ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCCtxSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cctx_workspace_bound_with_ext_seq_prod); ++ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCStreamSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cstream_workspace_bound_with_ext_seq_prod); ++ + zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) + { + if (workspace == NULL) +@@ -209,5 +262,25 @@ size_t zstd_end_stream(zstd_cstream *cst + } + EXPORT_SYMBOL(zstd_end_stream); + ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++) { ++ ZSTD_registerSequenceProducer(cctx, sequence_producer_state, sequence_producer); ++} ++EXPORT_SYMBOL(zstd_register_sequence_producer); ++ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size) ++{ ++ return ZSTD_compressSequencesAndLiterals(cctx, dst, dst_capacity, in_seqs, ++ in_seqs_size, literals, lit_size, ++ lit_capacity, decompressed_size); ++} ++EXPORT_SYMBOL(zstd_compress_sequences_and_literals); ++ + MODULE_LICENSE("Dual BSD/GPL"); + 
MODULE_DESCRIPTION("Zstd Compressor"); --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ diff --git a/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch index 2cd625d..b6e7bb3 100644 --- a/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch +++ b/debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch @@ -1,4 +1,4 @@ -From 3dacd15edbed579d6966a884ce04aab95f1dfdeb Mon Sep 17 00:00:00 2001 +From 0df7cc91ac0a3e84f2e0aeec1a71cd737de41b8a Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Mon, 22 Jan 2024 16:27:56 -0800 Subject: lib: zstd: Refactor intentional wrap-around test @@ -38,7 +38,7 @@ Signed-off-by: Kees Cook <keescook@chromium.org> --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c -@@ -618,7 +618,7 @@ size_t ZSTD_readSkippableFrame(void* dst +@@ -620,7 +620,7 @@ size_t ZSTD_readSkippableFrame(void* dst * @return : decompressed size of the frames contained */ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) { @@ -47,7 +47,7 @@ Signed-off-by: Kees Cook <keescook@chromium.org> while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { U32 const magicNumber = MEM_readLE32(src); -@@ -636,7 +636,7 @@ unsigned long long ZSTD_findDecompressed +@@ -638,7 +638,7 @@ unsigned long long ZSTD_findDecompressed { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; diff --git a/debian/patches/patchset-xanmod/amd/0001-platform-x86-amd-amd_3d_vcache-Add-AMD-3D-V-Cache-op.patch b/debian/patches/patchset-xanmod/amd/0001-platform-x86-amd-amd_3d_vcache-Add-AMD-3D-V-Cache-op.patch deleted file mode 100644 index 9ee1fb7..0000000 --- a/debian/patches/patchset-xanmod/amd/0001-platform-x86-amd-amd_3d_vcache-Add-AMD-3D-V-Cache-op.patch +++ /dev/null @@ -1,266 +0,0 @@ -From d2589889bf6001daef33479d29680fa3f991fae9 Mon Sep 17 00:00:00 2001 -From: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -Date: Tue, 12 Nov 2024 22:33:06 +0530 -Subject: [PATCH 1/2] platform/x86/amd: amd_3d_vcache: Add AMD 3D V-Cache - optimizer driver -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -AMD X3D processors, also known as AMD 3D V-Cache, feature dual Core -Complex Dies (CCDs) and enlarged L3 cache, enabling dynamic mode -switching between Frequency and Cache modes. To optimize performance, -implement the AMD 3D V-Cache Optimizer, which allows selecting either: - -Frequency mode: cores within the faster CCD are prioritized before -those in the slower CCD. - -Cache mode: cores within the larger L3 CCD are prioritized before -those in the smaller L3 CCD. 
- -Co-developed-by: Perry Yuan <perry.yuan@amd.com> -Signed-off-by: Perry Yuan <perry.yuan@amd.com> -Co-developed-by: Mario Limonciello <mario.limonciello@amd.com> -Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> -Reviewed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com> -Reviewed-by: Armin Wolf <W_Armin@gmx.de> -Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -Link: https://lore.kernel.org/r/20241112170307.3745777-2-Basavaraj.Natikar@amd.com -Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> -Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> -Signed-off-by: Alexandre Frade <kernel@xanmod.org> ---- - MAINTAINERS | 7 + - drivers/platform/x86/amd/Kconfig | 12 ++ - drivers/platform/x86/amd/Makefile | 2 + - drivers/platform/x86/amd/x3d_vcache.c | 176 ++++++++++++++++++++++++++ - 4 files changed, 197 insertions(+) - create mode 100644 drivers/platform/x86/amd/x3d_vcache.c - ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -965,6 +965,13 @@ Q: https://patchwork.kernel.org/project/ - F: drivers/infiniband/hw/efa/ - F: include/uapi/rdma/efa-abi.h - -+AMD 3D V-CACHE PERFORMANCE OPTIMIZER DRIVER -+M: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -+R: Mario Limonciello <mario.limonciello@amd.com> -+L: platform-driver-x86@vger.kernel.org -+S: Supported -+F: drivers/platform/x86/amd/x3d_vcache.c -+ - AMD ADDRESS TRANSLATION LIBRARY (ATL) - M: Yazen Ghannam <Yazen.Ghannam@amd.com> - L: linux-edac@vger.kernel.org ---- a/drivers/platform/x86/amd/Kconfig -+++ b/drivers/platform/x86/amd/Kconfig -@@ -19,6 +19,18 @@ config AMD_HSMP - If you choose to compile this driver as a module the module will be - called amd_hsmp. - -+config AMD_3D_VCACHE -+ tristate "AMD 3D V-Cache Performance Optimizer Driver" -+ depends on X86_64 && ACPI -+ help -+ The driver provides a sysfs interface, enabling the setting of a bias -+ that alters CPU core reordering. This bias prefers cores with higher -+ frequencies or larger L3 caches on processors supporting AMD 3D V-Cache -+ technology. -+ -+ If you choose to compile this driver as a module the module will be -+ called amd_3d_vcache. -+ - config AMD_WBRF - bool "AMD Wifi RF Band mitigations (WBRF)" - depends on ACPI ---- a/drivers/platform/x86/amd/Makefile -+++ b/drivers/platform/x86/amd/Makefile -@@ -4,6 +4,8 @@ - # AMD x86 Platform-Specific Drivers - # - -+obj-$(CONFIG_AMD_3D_VCACHE) += amd_3d_vcache.o -+amd_3d_vcache-objs := x3d_vcache.o - obj-$(CONFIG_AMD_PMC) += pmc/ - amd_hsmp-y := hsmp.o - obj-$(CONFIG_AMD_HSMP) += amd_hsmp.o ---- /dev/null -+++ b/drivers/platform/x86/amd/x3d_vcache.c -@@ -0,0 +1,176 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/* -+ * AMD 3D V-Cache Performance Optimizer Driver -+ * -+ * Copyright (c) 2024, Advanced Micro Devices, Inc. -+ * All Rights Reserved. 
-+ * -+ * Authors: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -+ * Perry Yuan <perry.yuan@amd.com> -+ * Mario Limonciello <mario.limonciello@amd.com> -+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+ -+#include <linux/acpi.h> -+#include <linux/array_size.h> -+#include <linux/device.h> -+#include <linux/errno.h> -+#include <linux/module.h> -+#include <linux/mutex.h> -+#include <linux/platform_device.h> -+#include <linux/pm.h> -+#include <linux/sysfs.h> -+#include <linux/uuid.h> -+ -+static char *x3d_mode = "frequency"; -+module_param(x3d_mode, charp, 0); -+MODULE_PARM_DESC(x3d_mode, "Initial 3D-VCache mode; 'frequency' (default) or 'cache'"); -+ -+#define DSM_REVISION_ID 0 -+#define DSM_SET_X3D_MODE 1 -+ -+static guid_t x3d_guid = GUID_INIT(0xdff8e55f, 0xbcfd, 0x46fb, 0xba, 0x0a, -+ 0xef, 0xd0, 0x45, 0x0f, 0x34, 0xee); -+ -+enum amd_x3d_mode_type { -+ MODE_INDEX_FREQ, -+ MODE_INDEX_CACHE, -+}; -+ -+static const char * const amd_x3d_mode_strings[] = { -+ [MODE_INDEX_FREQ] = "frequency", -+ [MODE_INDEX_CACHE] = "cache", -+}; -+ -+struct amd_x3d_dev { -+ struct device *dev; -+ acpi_handle ahandle; -+ /* To protect x3d mode setting */ -+ struct mutex lock; -+ enum amd_x3d_mode_type curr_mode; -+}; -+ -+static int amd_x3d_get_mode(struct amd_x3d_dev *data) -+{ -+ guard(mutex)(&data->lock); -+ -+ return data->curr_mode; -+} -+ -+static int amd_x3d_mode_switch(struct amd_x3d_dev *data, int new_state) -+{ -+ union acpi_object *out, argv; -+ -+ guard(mutex)(&data->lock); -+ argv.type = ACPI_TYPE_INTEGER; -+ argv.integer.value = new_state; -+ -+ out = acpi_evaluate_dsm(data->ahandle, &x3d_guid, DSM_REVISION_ID, -+ DSM_SET_X3D_MODE, &argv); -+ if (!out) { -+ dev_err(data->dev, "failed to evaluate _DSM\n"); -+ return -EINVAL; -+ } -+ -+ data->curr_mode = new_state; -+ -+ kfree(out); -+ -+ return 0; -+} -+ -+static ssize_t amd_x3d_mode_store(struct device *dev, struct device_attribute *attr, -+ const char *buf, size_t count) -+{ -+ struct amd_x3d_dev *data = dev_get_drvdata(dev); -+ int ret; -+ -+ ret = sysfs_match_string(amd_x3d_mode_strings, buf); -+ if (ret < 0) -+ return ret; -+ -+ ret = amd_x3d_mode_switch(data, ret); -+ if (ret < 0) -+ return ret; -+ -+ return count; -+} -+ -+static ssize_t amd_x3d_mode_show(struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ struct amd_x3d_dev *data = dev_get_drvdata(dev); -+ int mode = amd_x3d_get_mode(data); -+ -+ return sysfs_emit(buf, "%s\n", amd_x3d_mode_strings[mode]); -+} -+static DEVICE_ATTR_RW(amd_x3d_mode); -+ -+static struct attribute *amd_x3d_attrs[] = { -+ &dev_attr_amd_x3d_mode.attr, -+ NULL -+}; -+ATTRIBUTE_GROUPS(amd_x3d); -+ -+static int amd_x3d_resume_handler(struct device *dev) -+{ -+ struct amd_x3d_dev *data = dev_get_drvdata(dev); -+ int ret = amd_x3d_get_mode(data); -+ -+ return amd_x3d_mode_switch(data, ret); -+} -+ -+static DEFINE_SIMPLE_DEV_PM_OPS(amd_x3d_pm, NULL, amd_x3d_resume_handler); -+ -+static const struct acpi_device_id amd_x3d_acpi_ids[] = { -+ {"AMDI0101"}, -+ { }, -+}; -+MODULE_DEVICE_TABLE(acpi, amd_x3d_acpi_ids); -+ -+static int amd_x3d_probe(struct platform_device *pdev) -+{ -+ struct amd_x3d_dev *data; -+ acpi_handle handle; -+ int ret; -+ -+ handle = ACPI_HANDLE(&pdev->dev); -+ if (!handle) -+ return -ENODEV; -+ -+ if (!acpi_check_dsm(handle, &x3d_guid, DSM_REVISION_ID, BIT(DSM_SET_X3D_MODE))) -+ return -ENODEV; -+ -+ data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); -+ if (!data) -+ return -ENOMEM; -+ -+ data->dev = &pdev->dev; -+ -+ ret = devm_mutex_init(data->dev, &data->lock); -+ 
if (ret) -+ return ret; -+ -+ data->ahandle = handle; -+ platform_set_drvdata(pdev, data); -+ -+ ret = match_string(amd_x3d_mode_strings, ARRAY_SIZE(amd_x3d_mode_strings), x3d_mode); -+ if (ret < 0) -+ return dev_err_probe(&pdev->dev, -EINVAL, "invalid mode %s\n", x3d_mode); -+ -+ return amd_x3d_mode_switch(data, ret); -+} -+ -+static struct platform_driver amd_3d_vcache_driver = { -+ .driver = { -+ .name = "amd_x3d_vcache", -+ .dev_groups = amd_x3d_groups, -+ .acpi_match_table = amd_x3d_acpi_ids, -+ .pm = pm_sleep_ptr(&amd_x3d_pm), -+ }, -+ .probe = amd_x3d_probe, -+}; -+module_platform_driver(amd_3d_vcache_driver); -+ -+MODULE_DESCRIPTION("AMD 3D V-Cache Performance Optimizer Driver"); -+MODULE_LICENSE("GPL"); diff --git a/debian/patches/patchset-xanmod/amd/0002-platform-x86-amd-amd_3d_vcache-Add-sysfs-ABI-documen.patch b/debian/patches/patchset-xanmod/amd/0002-platform-x86-amd-amd_3d_vcache-Add-sysfs-ABI-documen.patch deleted file mode 100644 index 196157c..0000000 --- a/debian/patches/patchset-xanmod/amd/0002-platform-x86-amd-amd_3d_vcache-Add-sysfs-ABI-documen.patch +++ /dev/null @@ -1,55 +0,0 @@ -From edf899b17950e1b926889b501e06c86dd867bac0 Mon Sep 17 00:00:00 2001 -From: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -Date: Tue, 12 Nov 2024 22:33:07 +0530 -Subject: [PATCH 2/2] platform/x86/amd: amd_3d_vcache: Add sysfs ABI - documentation -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Add documentation for the amd_3d_vcache sysfs bus platform driver -interface so that userspace applications can use it to change mode -preferences, either frequency or cache. - -Co-developed-by: Perry Yuan <perry.yuan@amd.com> -Signed-off-by: Perry Yuan <perry.yuan@amd.com> -Co-developed-by: Mario Limonciello <mario.limonciello@amd.com> -Signed-off-by: Mario Limonciello <mario.limonciello@amd.com> -Reviewed-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com> -Reviewed-by: Armin Wolf <W_Armin@gmx.de> -Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> -Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -Link: https://lore.kernel.org/r/20241112170307.3745777-3-Basavaraj.Natikar@amd.com -Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> -Signed-off-by: Alexandre Frade <kernel@xanmod.org> ---- - .../sysfs-bus-platform-drivers-amd_x3d_vcache | 12 ++++++++++++ - MAINTAINERS | 1 + - 2 files changed, 13 insertions(+) - create mode 100644 Documentation/ABI/testing/sysfs-bus-platform-drivers-amd_x3d_vcache - ---- /dev/null -+++ b/Documentation/ABI/testing/sysfs-bus-platform-drivers-amd_x3d_vcache -@@ -0,0 +1,12 @@ -+What: /sys/bus/platform/drivers/amd_x3d_vcache/AMDI0101:00/amd_x3d_mode -+Date: November 2024 -+KernelVersion: 6.13 -+Contact: Basavaraj Natikar <Basavaraj.Natikar@amd.com> -+Description: (RW) AMD 3D V-Cache optimizer allows users to switch CPU core -+ rankings dynamically. -+ -+ This file switches between these two modes: -+ - "frequency" cores within the faster CCD are prioritized before -+ those in the slower CCD. -+ - "cache" cores within the larger L3 CCD are prioritized before -+ those in the smaller L3 CCD. 
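Aside: the two amd_3d_vcache patches are dropped from this series, presumably because the driver is upstream (the ABI entry above says KernelVersion 6.13, and this package tracks 6.14). The sysfs contract it documents is a plain read/write attribute, so switching modes from userspace needs nothing beyond open()/write(). A minimal sketch in C — the AMDI0101:00 instance name is taken from the ABI example above and may differ per machine:

/* Illustrative only: select "cache" mode via the documented attribute. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/bus/platform/drivers/amd_x3d_vcache/"
			   "AMDI0101:00/amd_x3d_mode";
	const char *mode = "cache";	/* or "frequency" */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mode, strlen(mode)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Reading the same file back returns the currently active mode, per the ABI description above.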
---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -970,6 +970,7 @@ M: Basavaraj Natikar <Basavaraj.Natikar@ - R: Mario Limonciello <mario.limonciello@amd.com> - L: platform-driver-x86@vger.kernel.org - S: Supported -+F: Documentation/ABI/testing/sysfs-bus-platform-drivers-amd_x3d_vcache - F: drivers/platform/x86/amd/x3d_vcache.c - - AMD ADDRESS TRANSLATION LIBRARY (ATL) diff --git a/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch index c6a559a..86b1022 100644 --- a/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch +++ b/debian/patches/patchset-xanmod/binder/0001-binder-turn-into-module.patch @@ -1,7 +1,7 @@ -From b492213c96ded86e7800b320706ad15bd31c7c1b Mon Sep 17 00:00:00 2001 +From ae8cebfd2446a0564c849adcd771ce538855b6b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner <christian@brauner.io> Date: Wed, 16 Jan 2019 23:13:25 +0100 -Subject: [PATCH 1/4] binder: turn into module +Subject: binder: turn into module The Android binder driver needs to become a module for the sake of shipping Anbox. To do this we need to export the following functions since binder is @@ -29,6 +29,7 @@ Signed-off-by: Seth Forshee <seth.forshee@canonical.com> [ arighi: zap_page_range() has been dropped, export zap_page_range_single() in 6.3 ] Signed-off-by: Andrea Righi <andrea.righi@canonical.com> Signed-off-by: Alexandre Frade <kernel@xanmod.org> +--- --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -45,7 +46,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> Binderfs is a pseudo-filesystem for the Android Binder IPC driver --- a/drivers/android/binder.c +++ b/drivers/android/binder.c -@@ -7027,9 +7027,20 @@ err_alloc_device_names_failed: +@@ -7031,9 +7031,20 @@ err_alloc_device_names_failed: return ret; } @@ -79,7 +80,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #include <linux/rbtree.h> #include <linux/list.h> #include <linux/mm.h> -@@ -111,7 +112,7 @@ struct binder_alloc { +@@ -120,7 +121,7 @@ struct binder_alloc { bool oneway_spam_detected; }; @@ -98,7 +99,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/mutex.h> -@@ -78,7 +79,7 @@ extern const struct file_operations bind +@@ -77,7 +78,7 @@ extern const struct file_operations bind extern char *binder_devices_param; @@ -107,7 +108,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> extern bool is_binderfs_device(const struct inode *inode); extern struct dentry *binderfs_create_file(struct dentry *dir, const char *name, const struct file_operations *fops, -@@ -99,7 +100,7 @@ static inline struct dentry *binderfs_cr +@@ -98,7 +99,7 @@ static inline struct dentry *binderfs_cr static inline void binderfs_remove_file(struct dentry *dentry) {} #endif @@ -127,7 +128,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #else bool use_reserve = true; #endif -@@ -399,7 +399,7 @@ static int binderfs_binder_ctl_create(st +@@ -402,7 +402,7 @@ static int binderfs_binder_ctl_create(st struct dentry *root = sb->s_root; struct binderfs_info *info = sb->s_fs_info; #if defined(CONFIG_IPC_NS) @@ -136,7 +137,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #else bool use_reserve = true; #endif -@@ -691,7 +691,7 @@ static int binderfs_fill_super(struct su +@@ -694,7 +694,7 @@ static int binderfs_fill_super(struct su return -ENOMEM; info = sb->s_fs_info; @@ -184,7 +185,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> return container_of(ns, 
struct ipc_namespace, ns); --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -3176,6 +3176,7 @@ struct vm_struct *get_vm_area(unsigned l +@@ -3181,6 +3181,7 @@ struct vm_struct *get_vm_area(unsigned l NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } diff --git a/debian/patches/patchset-xanmod/binder/0002-binder-turn-into-module-list_lru_add-list_lru_del.patch b/debian/patches/patchset-xanmod/binder/0002-binder-turn-into-module-list_lru_add-list_lru_del.patch new file mode 100644 index 0000000..0807677 --- /dev/null +++ b/debian/patches/patchset-xanmod/binder/0002-binder-turn-into-module-list_lru_add-list_lru_del.patch @@ -0,0 +1,29 @@ +From 0156792aef65a27c5938dc821630f5546dc6a3c9 Mon Sep 17 00:00:00 2001 +From: Paolo Pisati <paolo.pisati@canonical.com> +Date: Thu, 6 Feb 2025 15:38:05 +0100 +Subject: binder: turn into module - list_lru_add()/list_lru_del() + +Signed-off-by: Paolo Pisati <paolo.pisati@canonical.com> +Signed-off-by: Alexandre Frade <kernel@xanmod.org> +--- + mm/list_lru.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/mm/list_lru.c ++++ b/mm/list_lru.c +@@ -175,6 +175,7 @@ bool list_lru_add(struct list_lru *lru, + unlock_list_lru(l, false); + return false; + } ++EXPORT_SYMBOL_GPL(list_lru_add); + + bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) + { +@@ -212,6 +213,7 @@ bool list_lru_del(struct list_lru *lru, + unlock_list_lru(l, false); + return false; + } ++EXPORT_SYMBOL_GPL(list_lru_del); + + bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) + { diff --git a/debian/patches/patchset-xanmod/binder/0003-binder-turn-into-module-lock_vma_under_rcu.patch b/debian/patches/patchset-xanmod/binder/0003-binder-turn-into-module-lock_vma_under_rcu.patch new file mode 100644 index 0000000..0d9ab9f --- /dev/null +++ b/debian/patches/patchset-xanmod/binder/0003-binder-turn-into-module-lock_vma_under_rcu.patch @@ -0,0 +1,21 @@ +From 51d6dcc335e157df9ce5b9605841b879db64894a Mon Sep 17 00:00:00 2001 +From: Paolo Pisati <paolo.pisati@canonical.com> +Date: Thu, 6 Feb 2025 15:40:09 +0100 +Subject: binder: turn into module - lock_vma_under_rcu() + +Signed-off-by: Paolo Pisati <paolo.pisati@canonical.com> +Signed-off-by: Alexandre Frade <kernel@xanmod.org> +--- + mm/memory.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -6395,6 +6395,7 @@ inval: + count_vm_vma_lock_event(VMA_LOCK_ABORT); + return NULL; + } ++EXPORT_SYMBOL_GPL(lock_vma_under_rcu); + #endif /* CONFIG_PER_VMA_LOCK */ + + #ifndef __PAGETABLE_P4D_FOLDED diff --git a/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch index ec8edd9..1015e19 100644 --- a/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch +++ b/debian/patches/patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch @@ -1,7 +1,7 @@ -From cdcc9fde68f01d86d8f9ff0baaf0e9fbd15fa8ba Mon Sep 17 00:00:00 2001 +From fa6cddbfd7915ed81dcbed99f9e5b5a9267d80a3 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Thu, 13 Dec 2018 01:00:49 +0000 -Subject: [PATCH 1/4] sched/wait: Do accept() in LIFO order for cache +Subject: sched/wait: Do accept() in LIFO order for cache efficiency Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git 
a/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch index c4989b8..1085f44 100644 --- a/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch +++ b/debian/patches/patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch @@ -1,7 +1,7 @@ -From c6f8d4723c8185d7096cdea7f5e499184f22426e Mon Sep 17 00:00:00 2001 +From b837910f5e9f1928872e600a6835be6d422b761b Mon Sep 17 00:00:00 2001 From: William Douglas <william.douglas@intel.com> Date: Wed, 20 Jun 2018 17:23:21 +0000 -Subject: [PATCH 2/4] firmware: Enable stateless firmware loading +Subject: firmware: Enable stateless firmware loading Prefer the order of specific version before generic and /etc before /lib to enable the user to give specific overrides for generic diff --git a/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch index 56bb85e..43d792a 100644 --- a/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch +++ b/debian/patches/patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch @@ -1,7 +1,7 @@ -From 78a04a7536d68fa0d8e7dc2955d37aa7f592fca5 Mon Sep 17 00:00:00 2001 +From 274ba9c23b6fe3212c7f02f3e833086427034705 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH 3/4] locking: rwsem: spin faster +Subject: locking: rwsem: spin faster tweak rwsem owner spinning a bit diff --git a/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch b/debian/patches/patchset-xanmod/clearlinux/0004-drivers-initialize-ata-before-graphics.patch similarity index 87% rename from debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch rename to debian/patches/patchset-xanmod/clearlinux/0004-drivers-initialize-ata-before-graphics.patch index fc382cc..db05acc 100644 --- a/debian/patches/patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch +++ b/debian/patches/patchset-xanmod/clearlinux/0004-drivers-initialize-ata-before-graphics.patch @@ -1,11 +1,13 @@ -From 48d2ea8801ccf8bd9cd48c12fce79040bbcae363 Mon Sep 17 00:00:00 2001 +From 0234467781c5b1c50f71f3936571e4ea3e77c279 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: ZEN: Initialize ata before graphics +Subject: drivers: initialize ata before graphics ATA init is the long pole in the boot process, and its asynchronous. 
move the graphics init after it so that ata and graphics initialize in parallel + +Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- drivers/Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch index 6b75adc..7aa30cf 100644 --- a/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch +++ b/debian/patches/patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch @@ -1,7 +1,7 @@ From 2099f9c57216c836e445d2f6ba65f04131267f47 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 27 Feb 2023 01:38:18 +0000 -Subject: [PATCH 1/2] netfilter: Add netfilter nf_tables fullcone support +Subject: netfilter: Add netfilter nf_tables fullcone support Signed-off-by: Syrone Wong <wong.syrone@gmail.com> Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch index c165b36..81445dc 100644 --- a/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch +++ b/debian/patches/patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch @@ -1,7 +1,7 @@ From 6fbfabdc4e5ef8a186c27e4ed2db28ee1ddf4b4e Mon Sep 17 00:00:00 2001 From: Felix Fietkau <nbd@nbd.name> Date: Tue, 20 Feb 2018 15:56:02 +0100 -Subject: [PATCH 2/2] netfilter: add xt_FLOWOFFLOAD target +Subject: netfilter: add xt_FLOWOFFLOAD target Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -18,7 +18,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h -@@ -294,6 +294,11 @@ void nf_flow_table_free(struct nf_flowta +@@ -295,6 +295,11 @@ void nf_flow_table_free(struct nf_flowta void flow_offload_teardown(struct flow_offload *flow); @@ -88,7 +88,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> -@@ -373,8 +372,7 @@ flow_offload_lookup(struct nf_flowtable +@@ -413,8 +412,7 @@ flow_offload_lookup(struct nf_flowtable } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -98,7 +98,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> void (*iter)(struct nf_flowtable *flowtable, struct flow_offload *flow, void *data), void *data) -@@ -435,6 +433,7 @@ static void nf_flow_offload_gc_step(stru +@@ -580,6 +578,7 @@ static void nf_flow_offload_gc_step(stru nf_flow_offload_stats(flow_table, flow); } } diff --git a/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch similarity index 89% rename from debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch index 540f013..88950f9 100644 --- a/debian/patches/misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch +++ 
b/debian/patches/patchset-xanmod/net/tcp/bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch @@ -1,7 +1,7 @@ -From b9540ffedb31e687585b586b9f96543928f6b99b Mon Sep 17 00:00:00 2001 +From 5435b92688a57d175607374d5bbff357e4ba3e71 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Tue, 11 Jun 2019 12:26:55 -0400 -Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection +Subject: net-tcp_bbr: broaden app-limited rate sample detection This commit is a bug fix for the Linux TCP app-limited (application-limited) logic that is used for collecting rate @@ -42,7 +42,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> * is in window. --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -690,6 +690,7 @@ void tcp_write_timer_handler(struct sock +@@ -699,6 +699,7 @@ void tcp_write_timer_handler(struct sock return; } diff --git a/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch similarity index 95% rename from debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch index 8c05364..0dfd1ba 100644 --- a/debian/patches/misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch @@ -1,7 +1,7 @@ -From 9fd50b0891febee43126ba643bfe56f72dd23bad Mon Sep 17 00:00:00 2001 +From 9aa33a35b5b9cbe65c87e6f9438e69ede143d11a Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Sun, 24 Jun 2018 21:55:59 -0400 -Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp, +Subject: net-tcp_bbr: v2: shrink delivered_mstamp, first_tx_mstamp to u32 to free up 8 bytes Free up some space for tracking inflight and losses for each diff --git a/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch similarity index 96% rename from debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch index 909ff0f..e0d7062 100644 --- a/debian/patches/misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch @@ -1,7 +1,7 @@ -From 7c5c7e14043aaf99390cb9c71140f63cd574ffda Mon Sep 17 00:00:00 2001 +From 63e1d064c4e4355293b9ee7014f4559cdeba4b8b Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Sat, 5 Aug 2017 11:49:50 -0400 -Subject: [PATCH 03/19] net-tcp_bbr: v2: snapshot packets in flight at transmit +Subject: net-tcp_bbr: v2: snapshot packets in flight at transmit time and pass in rate_sample CC algorithms may want to snapshot the number of packets in flight at diff --git a/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch similarity index 95% rename from debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch rename 
to debian/patches/patchset-xanmod/net/tcp/bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch index 26326c7..9fe6d7d 100644 --- a/debian/patches/misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch @@ -1,7 +1,7 @@ -From 205288c0ef4b4059c0ae8a2bb85b74a1c371d700 Mon Sep 17 00:00:00 2001 +From 4022fb6da58dd67760dc8f3351067945a377df91 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Thu, 12 Oct 2017 23:44:27 -0400 -Subject: [PATCH 04/19] net-tcp_bbr: v2: count packets lost over TCP rate +Subject: net-tcp_bbr: v2: count packets lost over TCP rate sampling interval For understanding the relationship between inflight and packet loss diff --git a/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch similarity index 90% rename from debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch index f3391d9..1faa17a 100644 --- a/debian/patches/misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch @@ -1,7 +1,7 @@ -From 1fe2a421fbf80580ad76f528c6489633667e5851 Mon Sep 17 00:00:00 2001 +From 3ff71ca0a15ebe4e5db9c0089121eafd2efc02ba Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Mon, 19 Nov 2018 13:48:36 -0500 -Subject: [PATCH 05/19] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece +Subject: net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece For understanding the relationship between inflight and ECN signals, to try to find the highest inflight value that has acceptable levels diff --git a/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch similarity index 93% rename from debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch index 4ab93fe..4686a87 100644 --- a/debian/patches/misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch @@ -1,7 +1,7 @@ -From 96bcebd73a3df154c7c5100694deb069a2157655 Mon Sep 17 00:00:00 2001 +From fa9348cbc2b5a0f1f3fc82e51ae6ce956f8cfb1f Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Tue, 7 Aug 2018 21:52:06 -0400 -Subject: [PATCH 06/19] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC +Subject: net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC module callback API For connections experiencing reordering, RACK can mark packets lost diff --git a/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch similarity index 94% rename from debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch rename to 
debian/patches/patchset-xanmod/net/tcp/bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch index e498a0c..2008c9a 100644 --- a/debian/patches/misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch @@ -1,7 +1,7 @@ -From ab109789c18f7edf7a34923398a64ba7ba38cc6c Mon Sep 17 00:00:00 2001 +From 3add8086d7d76fe240fb341a4e49149ac4332990 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Wed, 1 May 2019 20:16:33 -0400 -Subject: [PATCH 07/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in +Subject: net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in tcp_shifted_skb() When tcp_shifted_skb() updates state as adjacent SACKed skbs are diff --git a/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch similarity index 96% rename from debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch index a9f991b..8dd5442 100644 --- a/debian/patches/misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch @@ -1,7 +1,7 @@ -From 6d0d550fdc6692ee65d01453d380ffba4b5a97e9 Mon Sep 17 00:00:00 2001 +From 6363d43645b3383ba590d0574dc37a215386aacf Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Wed, 1 May 2019 20:16:25 -0400 -Subject: [PATCH 08/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in +Subject: net-tcp_bbr: v2: adjust skb tx.in_flight upon split in tcp_fragment() When we fragment an skb that has already been sent, we need to update diff --git a/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch similarity index 94% rename from debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch index 6f7f90e..2997ff4 100644 --- a/debian/patches/misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch @@ -1,7 +1,7 @@ -From 55dba52fd12522bb1c211acdd37f051e8bf5c57b Mon Sep 17 00:00:00 2001 +From 8c1b5bf6012099cba8911e255487bca5d0a2bd02 Mon Sep 17 00:00:00 2001 From: Yousuk Seung <ysseung@google.com> Date: Wed, 23 May 2018 17:55:54 -0700 -Subject: [PATCH 09/19] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS +Subject: net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS Add a a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a congestion control module to receive CE events. 
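Note: patches 0001–0009 above are pure renames (misc-bbr3/ → patchset-xanmod/net/tcp/bbr3/) plus header refreshes; their hunks are unchanged. For orientation, the TCP_CONG_WANTS_CE_EVENTS flag that 0009 introduces is consumed roughly as below — a hedged sketch of a hypothetical CC module, not code from the series. The example_cc name and stub callback are invented; the Reno helpers and CA_EVENT_ECN_IS_CE are existing kernel symbols, and delivering CE marks via cwnd_event() mirrors how tcp_dctcp.c consumes them:

#include <linux/module.h>
#include <net/tcp.h>

/* Stub: with TCP_CONG_WANTS_CE_EVENTS set, CE transitions arrive here. */
static void example_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
	if (ev == CA_EVENT_ECN_IS_CE)
		pr_debug("CE mark observed\n");
}

static struct tcp_congestion_ops example_cc __read_mostly = {
	.flags		= TCP_CONG_WANTS_CE_EVENTS,	/* flag added by 0009 */
	.name		= "example_cc",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
	.cwnd_event	= example_cwnd_event,
};

static int __init example_cc_init(void)
{
	return tcp_register_congestion_control(&example_cc);
}

static void __exit example_cc_exit(void)
{
	tcp_unregister_congestion_control(&example_cc);
}

module_init(example_cc_init);
module_exit(example_cc_exit);
MODULE_LICENSE("GPL");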
diff --git a/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch similarity index 81% rename from debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch index 2bf2013..a305b07 100644 --- a/debian/patches/misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch @@ -1,7 +1,7 @@ -From 2c015b38804583667528b976c1cc9f9c1c42c104 Mon Sep 17 00:00:00 2001 +From 15fd38de916127d286bd373903fdfa5215b05aa4 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Fri, 27 Sep 2019 17:10:26 -0400 -Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API +Subject: net-tcp: re-generalize TSO sizing in TCP CC module API Reorganize the API for CC modules so that the CC module once again gets complete control of the TSO sizing decision. This is how the API @@ -21,9 +21,10 @@ Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- include/net/tcp.h | 4 ++-- + net/ipv4/bpf_tcp_ca.c | 4 ++-- net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++----------- net/ipv4/tcp_output.c | 11 +++++------ - 3 files changed, 33 insertions(+), 19 deletions(-) + 4 files changed, 35 insertions(+), 21 deletions(-) --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -38,6 +39,26 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> /* react to a specific lost skb (optional) */ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_o + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs( @@ -82,7 +103,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ diff --git a/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch similarity index 67% rename from debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch index 58df491..ce7a926 100644 --- 
a/debian/patches/misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch @@ -1,14 +1,15 @@ -From 3002091ca590f27c6c7c5966883502c87502e01f Mon Sep 17 00:00:00 2001 +From 344af0ac329b2b1ce5f1ce920166e4aeb5e83037 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> -Date: Sun, 7 Jan 2024 21:11:26 -0300 -Subject: [PATCH 11/19] net-tcp: add fast_ack_mode=1: skip rwin check in +Date: Sat, 16 Nov 2019 13:16:25 -0500 +Subject: net-tcp: add fast_ack_mode=1: skip rwin check in tcp_fast_ack_mode__tcp_ack_snd_check() -Add logic for an experimental TCP connection behavior, enabled with +Add logic for an optional TCP connection behavior, enabled with tp->fast_ack_mode = 1, which disables checking the receive window before sending an ack in __tcp_ack_snd_check(). If this behavior is enabled, the data receiver sends an ACK if the amount of data is > -RCV.MSS. +RCV.MSS. TCP congestion control modules can enable this bit if +they want to generate ACKs quickly. Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4 Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -21,19 +22,19 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/include/linux/tcp.h +++ b/include/linux/tcp.h -@@ -369,7 +369,8 @@ struct tcp_sock { - u8 compressed_ack; - u8 dup_ack_counter:2, - tlp_retrans:1, /* TLP is a retransmission */ -- unused:5; -+ fast_ack_mode:2, /* which fast ack mode ? */ -+ unused:3; - u8 thin_lto : 1,/* Use linear timeouts for thin streams */ - fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +@@ -243,7 +243,8 @@ struct tcp_sock { + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3397,6 +3397,7 @@ int tcp_disconnect(struct sock *sk, int +@@ -3398,6 +3398,7 @@ int tcp_disconnect(struct sock *sk, int tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -53,7 +54,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> if (tcp_ca_needs_ecn(sk)) --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -5776,13 +5776,14 @@ static void __tcp_ack_snd_check(struct s +@@ -5782,13 +5782,14 @@ static void __tcp_ack_snd_check(struct s /* More than one full frame received... 
*/ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && diff --git a/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch similarity index 63% rename from debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch index 94c918f..fee7d70 100644 --- a/debian/patches/misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch @@ -1,7 +1,7 @@ -From 687f09c22583ec5ef52aa93844248c9f93a2ce6e Mon Sep 17 00:00:00 2001 +From 18f564dbe586ab02c48563a9e05e71aa1a421607 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang <jfwang@google.com> Date: Fri, 19 Jun 2020 17:33:45 +0000 -Subject: [PATCH 12/19] net-tcp_bbr: v2: record app-limited status of +Subject: net-tcp_bbr: v2: record app-limited status of TLP-repaired flight When sending a TLP retransmit, record whether the outstanding flight @@ -23,19 +23,19 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/include/linux/tcp.h +++ b/include/linux/tcp.h -@@ -370,7 +370,8 @@ struct tcp_sock { - u8 dup_ack_counter:2, - tlp_retrans:1, /* TLP is a retransmission */ - fast_ack_mode:2, /* which fast ack mode ? */ -- unused:3; -+ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ -+ unused:2; - u8 thin_lto : 1,/* Use linear timeouts for thin streams */ - fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +@@ -301,7 +301,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c -@@ -3008,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk +@@ -3006,6 +3006,7 @@ void tcp_send_loss_probe(struct sock *sk if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; diff --git a/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch similarity index 91% rename from debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch index 1622cfe..368e661 100644 --- a/debian/patches/misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch @@ -1,7 +1,7 @@ -From 024469ad0aec82573e1aa8f3dde55aeac4c35aee Mon Sep 17 00:00:00 2001 +From 8da6e7d31a73453ce8495f004951069f5ef67c86 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang <jfwang@google.com> Date: Tue, 16 Jun 2020 17:41:19 +0000 -Subject: [PATCH 13/19] net-tcp_bbr: v2: inform CC module of losses repaired by +Subject: net-tcp_bbr: v2: inform CC module of losses repaired by TLP probe Before this commit, when there is a packet loss that creates a sequence diff --git a/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch similarity index 94% rename from debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch index ba9e266..bd3c8c7 100644 --- a/debian/patches/misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch @@ -1,7 +1,7 @@ -From 31adbbdcafaeac73d39cae76c6d513fea28779f1 Mon Sep 17 00:00:00 2001 +From 528d5f9d97954b32db6ae1fe1729c4965886b6df Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Mon, 21 Sep 2020 14:46:26 -0400 -Subject: [PATCH 14/19] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq +Subject: net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq into rate_sample Introduce is_acking_tlp_retrans_seq into rate_sample. 
This bool will diff --git a/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch similarity index 91% rename from debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch index 4d5b796..1bc99f1 100644 --- a/debian/patches/misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch @@ -1,7 +1,7 @@ -From 5e219e6228cb7b13a7d9a1d05c6e4846363fd6fe Mon Sep 17 00:00:00 2001 +From a086cf589b0ab974965d88d338c0a373eff5d67c Mon Sep 17 00:00:00 2001 From: David Morley <morleyd@google.com> Date: Fri, 14 Jul 2023 11:07:56 -0400 -Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW +Subject: tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW. @@ -41,9 +41,9 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -794,6 +795,15 @@ static inline void tcp_fast_path_check(s - tcp_fast_path_on(tp); - } +@@ -796,6 +797,15 @@ static inline void tcp_fast_path_check(s + + u32 tcp_delack_max(const struct sock *sk); +static inline void tcp_set_ecn_low_from_dst(struct sock *sk, + const struct dst_entry *dst) @@ -54,12 +54,12 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + tp->ecn_flags |= TCP_ECN_LOW; +} + - u32 tcp_delack_max(const struct sock *sk); - /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h -@@ -507,12 +507,14 @@ enum { +@@ -516,12 +516,14 @@ enum { #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) @@ -77,7 +77,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> __u8 proto; --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c -@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *s +@@ -471,6 +471,8 @@ void tcp_ca_openreq_child(struct sock *s u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; diff --git a/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch similarity index 97% rename from debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch index b2a1b4a..9b58e1f 100644 --- a/debian/patches/misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch @@ -1,7 +1,7 @@ -From de7a1729144df5a664b32643fc2246da8021e01c Mon Sep 17 00:00:00 2001 +From 3259adaa6771b29fdf023acffe469979cdd1caae Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Tue, 11 Jun 2019 12:54:22 -0400 -Subject: [PATCH 16/19] net-tcp_bbr: v3: update TCP "bbr" congestion control +Subject: net-tcp_bbr: v3: update TCP "bbr" congestion control module to BBRv3 BBR v3 is an enhacement to the BBR v1 
algorithm. It's designed to aim for lower @@ -135,8 +135,8 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> include/net/tcp.h | 2 +- include/uapi/linux/inet_diag.h | 23 + net/ipv4/Kconfig | 21 +- - net/ipv4/tcp_bbr.c | 2214 +++++++++++++++++++++------- - 5 files changed, 1740 insertions(+), 524 deletions(-) + net/ipv4/tcp_bbr.c | 2217 +++++++++++++++++++++------- + 5 files changed, 1742 insertions(+), 525 deletions(-) --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -439,7 +439,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ }; -/* Randomize the starting gain cycling phase over N phases: */ -static const u32 bbr_cycle_rand = 7; @@ -659,16 +659,17 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> static void bbr_init_pacing_rate_from_rtt(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); -@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rt +@@ -279,7 +456,8 @@ static void bbr_init_pacing_rate_from_rt bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; do_div(bw, rtt_us); WRITE_ONCE(sk->sk_pacing_rate, - bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); -+ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); } /* Pace using current bw estimate and a gain factor. */ -@@ -295,31 +472,38 @@ static void bbr_set_pacing_rate(struct s +@@ -295,31 +473,38 @@ static void bbr_set_pacing_rate(struct s WRITE_ONCE(sk->sk_pacing_rate, rate); } @@ -718,16 +719,16 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> { return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); } -@@ -329,7 +513,7 @@ static u32 bbr_tso_segs_goal(struct sock +@@ -329,7 +514,7 @@ static u32 bbr_tso_segs_goal(struct sock { struct tcp_sock *tp = tcp_sk(sk); -- return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); +- return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -349,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(s +@@ -349,7 +534,9 @@ __bpf_kfunc static void bbr_cwnd_event(s struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -738,7 +739,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bbr->idle_restart = 1; bbr->ack_epoch_mstamp = tp->tcp_mstamp; bbr->ack_epoch_acked = 0; -@@ -360,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(s +@@ -360,6 +547,16 @@ __bpf_kfunc static void bbr_cwnd_event(s bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); else if (bbr->mode == BBR_PROBE_RTT) bbr_check_probe_rtt_done(sk); @@ -755,7 +756,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } } -@@ -382,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 +@@ -382,10 +579,10 @@ static u32 bbr_bdp(struct sock *sk, u32 * default. This should only happen when the connection is not using TCP * timestamps and has retransmitted all of the SYN/SYNACK/data packets * ACKed so far. 
In this case, an RTO can cut cwnd to 1, in which @@ -768,7 +769,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> w = (u64)bw * bbr->min_rtt_us; -@@ -402,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 +@@ -402,23 +599,23 @@ static u32 bbr_bdp(struct sock *sk, u32 * - one skb in sending host Qdisc, * - one skb in sending host TSO/GSO engine * - one skb being received by receiver host LRO/GRO/delayed-ACK engine @@ -800,7 +801,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> cwnd += 2; return cwnd; -@@ -473,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(stru +@@ -473,10 +670,10 @@ static u32 bbr_ack_aggregation_cwnd(stru { u32 max_aggr_cwnd, aggr_cwnd = 0; @@ -813,7 +814,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> >> BBR_SCALE; aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); } -@@ -484,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(stru +@@ -484,66 +681,27 @@ static u32 bbr_ack_aggregation_cwnd(stru return aggr_cwnd; } @@ -887,7 +888,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> target_cwnd = bbr_bdp(sk, bw, gain); /* Increment the cwnd to account for excess ACKed data that seems -@@ -552,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk +@@ -552,74 +710,26 @@ static void bbr_set_cwnd(struct sock *sk target_cwnd += bbr_ack_aggregation_cwnd(sk); target_cwnd = bbr_quantization_budget(sk, target_cwnd); @@ -979,7 +980,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } static void bbr_reset_startup_mode(struct sock *sk) -@@ -629,191 +738,49 @@ static void bbr_reset_startup_mode(struc +@@ -629,191 +739,49 @@ static void bbr_reset_startup_mode(struc bbr->mode = BBR_STARTUP; } @@ -1195,7 +1196,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } /* Estimates the windowed max degree of ack aggregation. -@@ -827,7 +794,7 @@ static void bbr_update_bw(struct sock *s +@@ -827,7 +795,7 @@ static void bbr_update_bw(struct sock *s * * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). * Max filter is an approximate sliding window of 5-10 (packet timed) round @@ -1204,7 +1205,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> */ static void bbr_update_ack_aggregation(struct sock *sk, const struct rate_sample *rs) -@@ -835,15 +802,19 @@ static void bbr_update_ack_aggregation(s +@@ -835,15 +803,19 @@ static void bbr_update_ack_aggregation(s u32 epoch_us, expected_acked, extra_acked; struct bbr *bbr = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1226,7 +1227,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bbr->extra_acked_win_rtts = 0; bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? 
0 : 1; -@@ -877,49 +848,6 @@ static void bbr_update_ack_aggregation(s +@@ -877,49 +849,6 @@ static void bbr_update_ack_aggregation(s bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; } @@ -1276,7 +1277,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> static void bbr_check_probe_rtt_done(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); -@@ -929,9 +857,9 @@ static void bbr_check_probe_rtt_done(str +@@ -929,9 +858,9 @@ static void bbr_check_probe_rtt_done(str after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) return; @@ -1288,7 +1289,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -@@ -957,23 +885,35 @@ static void bbr_update_min_rtt(struct so +@@ -957,23 +886,35 @@ static void bbr_update_min_rtt(struct so { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -1333,7 +1334,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } if (bbr->mode == BBR_PROBE_RTT) { -@@ -982,9 +922,9 @@ static void bbr_update_min_rtt(struct so +@@ -982,9 +923,9 @@ static void bbr_update_min_rtt(struct so (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; /* Maintain min packets in flight for max(200 ms, 1 round). */ if (!bbr->probe_rtt_done_stamp && @@ -1345,7 +1346,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bbr->probe_rtt_round_done = 0; bbr->next_rtt_delivered = tp->delivered; } else if (bbr->probe_rtt_done_stamp) { -@@ -1005,18 +945,20 @@ static void bbr_update_gains(struct sock +@@ -1005,18 +946,20 @@ static void bbr_update_gains(struct sock switch (bbr->mode) { case BBR_STARTUP: @@ -1374,7 +1375,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> break; case BBR_PROBE_RTT: bbr->pacing_gain = BBR_UNIT; -@@ -1028,27 +970,1108 @@ static void bbr_update_gains(struct sock +@@ -1028,27 +971,1108 @@ static void bbr_update_gains(struct sock } } @@ -1487,7 +1488,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + + /* See if we should use ECN sender logic for this connection. */ + if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && -+ bbr_param(sk, ecn_factor) && ++ !!bbr_param(sk, ecn_factor) && + (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || + !bbr_ecn_max_rtt_us)) + bbr->ecn_eligible = 1; @@ -1576,8 +1577,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + +/* Does loss/ECN rate for this sample say inflight is "too high"? + * This is used by both the bbr_check_loss_too_high_in_startup() function, -+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which -+ * uses it to notice when loss/ECN rates suggest inflight is too high. ++ * and in PROBE_UP. + */ +static bool bbr_is_inflight_too_high(const struct sock *sk, + const struct rate_sample *rs) @@ -1594,7 +1594,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { + ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> + BBR_SCALE; + if (rs->delivered_ce > ecn_thresh) { @@ -1792,7 +1792,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + return; + + /* ECN response. 
*/ -+ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { + bbr_init_lower_bounds(sk, false); + bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); + } @@ -2394,8 +2394,9 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + bbr_update_cycle_phase(sk, rs, ctx); bbr_update_min_rtt(sk, rs); - bbr_update_gains(sk); -+} -+ + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) +/* Fast path for app-limited case. + * + * On each ack, we execute bbr state machine, which primarily consists of: @@ -2445,9 +2446,10 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> + *update_model = false; + } + return false; - } - - __bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) { + struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -2492,7 +2494,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> } __bpf_kfunc static void bbr_init(struct sock *sk) -@@ -1056,20 +2079,21 @@ __bpf_kfunc static void bbr_init(struct +@@ -1056,20 +2080,21 @@ __bpf_kfunc static void bbr_init(struct struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); @@ -2519,7 +2521,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bbr->has_seen_rtt = 0; bbr_init_pacing_rate_from_rtt(sk); -@@ -1080,7 +2104,7 @@ __bpf_kfunc static void bbr_init(struct +@@ -1080,7 +2105,7 @@ __bpf_kfunc static void bbr_init(struct bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; bbr->cycle_idx = 0; @@ -2528,7 +2530,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bbr_reset_startup_mode(sk); bbr->ack_epoch_mstamp = tp->tcp_mstamp; -@@ -1090,78 +2114,236 @@ __bpf_kfunc static void bbr_init(struct +@@ -1090,78 +2115,236 @@ __bpf_kfunc static void bbr_init(struct bbr->extra_acked[0] = 0; bbr->extra_acked[1] = 0; @@ -2793,7 +2795,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, -@@ -1174,10 +2356,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids +@@ -1174,10 +2357,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -2806,7 +2808,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> BTF_ID_FLAGS(func, bbr_set_state) BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) -@@ -1210,5 +2393,12 @@ MODULE_AUTHOR("Van Jacobson <vanj@google +@@ -1210,5 +2394,12 @@ MODULE_AUTHOR("Van Jacobson <vanj@google MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); diff --git a/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch similarity index 90% rename from debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch index 30257dd..3f028e5 100644 --- a/debian/patches/misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch @@ -1,7 +1,7 @@ -From 
8a87d0d851b9c06455a2def28fa8c2624ffa2e1a Mon Sep 17 00:00:00 2001 +From 79dbc43c63d17b05e0b04c6ed68b5e24515cfe2f Mon Sep 17 00:00:00 2001 From: Adithya Abraham Philip <abrahamphilip@google.com> Date: Fri, 11 Jun 2021 21:56:10 +0000 -Subject: [PATCH 17/19] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT +Subject: net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT on retransmits Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to @@ -35,7 +35,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> TCP_TW_SUCCESS = 0, --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c -@@ -2151,6 +2151,9 @@ __bpf_kfunc static void bbr_init(struct +@@ -2152,6 +2152,9 @@ __bpf_kfunc static void bbr_init(struct bbr->plb.pause_until = 0; tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; diff --git a/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch b/debian/patches/patchset-xanmod/net/tcp/bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch similarity index 81% rename from debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch rename to debian/patches/patchset-xanmod/net/tcp/bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch index 70da5e6..03785e1 100644 --- a/debian/patches/misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch +++ b/debian/patches/patchset-xanmod/net/tcp/bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch @@ -1,7 +1,7 @@ -From 4cd6a6f7a76a9acd5acc590dfbde3b1386a9e11e Mon Sep 17 00:00:00 2001 +From 74f5a9e717fb41742cf30802e9f9c55c001d2576 Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Sun, 23 Jul 2023 23:25:34 -0400 -Subject: [PATCH 18/19] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options +Subject: tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options field Analogous to other important ECN information, export TCPI_OPT_ECN_LOW @@ -21,13 +21,13 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ -+#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN enabled at conn init */ /* * Sender's congestion state indicating normal or abnormal situations --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -4124,6 +4124,8 @@ void tcp_get_info(struct sock *sk, struc +@@ -4125,6 +4125,8 @@ void tcp_get_info(struct sock *sk, struc info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; diff --git a/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch index 76f3e94..9241875 100644 --- a/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch +++ b/debian/patches/patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch @@ -67,7 +67,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> TP_PROTO(const struct sock *sk, const struct request_sock *req), --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c -@@ -1558,6 +1558,13 @@ static struct ctl_table ipv4_net_table[] +@@ -1568,6 +1568,13 @@ static struct ctl_table ipv4_net_table[] 
.extra2 = SYSCTL_ONE, }, { @@ -83,7 +83,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> .maxlen = sizeof(u8), --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -5658,6 +5658,7 @@ static bool tcp_prune_ofo_queue(struct s +@@ -5664,6 +5664,7 @@ static bool tcp_prune_ofo_queue(struct s static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -91,7 +91,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); -@@ -5669,6 +5670,39 @@ static int tcp_prune_queue(struct sock * +@@ -5675,6 +5676,39 @@ static int tcp_prune_queue(struct sock * if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -131,7 +131,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, NULL, -@@ -5687,6 +5721,8 @@ static int tcp_prune_queue(struct sock * +@@ -5693,6 +5727,8 @@ static int tcp_prune_queue(struct sock * if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -142,7 +142,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> * and hopefully then we'll have sufficient space. --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c -@@ -3525,6 +3525,7 @@ static int __net_init tcp_sk_init(struct +@@ -3530,6 +3530,7 @@ static int __net_init tcp_sk_init(struct net->ipv4.sysctl_tcp_syn_linear_timeouts = 4; net->ipv4.sysctl_tcp_shrink_window = 0; diff --git a/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch index 7330405..e80e364 100644 --- a/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch +++ b/debian/patches/patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch @@ -1,4 +1,4 @@ -From d79c32bd2a17e206d1c198570ef705549d0f644b Mon Sep 17 00:00:00 2001 +From 2eb935c59e24cc1303dcb7153261be0a1b61b38b Mon Sep 17 00:00:00 2001 From: Mark Weiman <mark.weiman@markzz.com> Date: Sun, 12 Aug 2018 11:36:21 -0400 Subject: [PATCH] PCI: Enable overrides for missing ACS capabilities @@ -55,7 +55,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4475,6 +4475,15 @@ +@@ -4644,6 +4644,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. 
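For context on the tcp_collapse hunks refreshed a few files up: the idea of the
Cloudflare patch is, roughly, to let operators skip the skb-collapse pass that
tcp_prune_queue() runs under the socket lock when it would have to chew through
too much receive-queue memory. The sketch below is illustrative only: the field
name net->ipv4.sysctl_tcp_collapse_max_bytes is assumed (its initialiser is the
one-line addition in the tcp_sk_init() hunk, elided here), the comparison is an
assumption, and the authoritative logic is the 33-line block that this refresh
merely re-numbers.

    #include <net/sock.h>

    /* Sketch, not the patch body: decide whether to skip the
     * socket-locked collapse pass. A budget of 0 (the default)
     * keeps stock behaviour.
     */
    static bool tcp_collapse_over_budget(const struct sock *sk)
    {
            u32 max_bytes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_collapse_max_bytes);

            if (!max_bytes)
                    return false;   /* feature disabled */

            /* Assumed heuristic: too much queued memory to collapse cheaply. */
            return (u32)atomic_read(&sk->sk_rmem_alloc) > max_bytes;
    }

Per the patch title, when the budget is exceeded the collapse processing is
skipped entirely; the exact fallback behaviour lives in the added block itself.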
diff --git a/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch index 591609e..2eda7a9 100644 --- a/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch +++ b/debian/patches/patchset-xanmod/valve/0001-extcon-Add-driver-for-Steam-Deck.patch @@ -1,7 +1,7 @@ -From e914b6a0d571a92db04869a02e06dc83ec7c0700 Mon Sep 17 00:00:00 2001 +From cd6bf6bb5fd26e58638aa441dacd9104eb990fe5 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sun, 27 Feb 2022 14:46:08 -0800 -Subject: [PATCH 1/6] extcon: Add driver for Steam Deck +Subject: extcon: Add driver for Steam Deck (cherry picked from commit f9f2eddae582ae39d5f89c1218448fc259b90aa8) Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com> diff --git a/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch index 5604c53..69cdd0c 100644 --- a/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch +++ b/debian/patches/patchset-xanmod/valve/0002-hwmon-Add-driver-for-Steam-Deck-s-EC-sensors.patch @@ -1,7 +1,7 @@ -From 8fe7bb2680d3e1201fdf3329e51078831f32fe12 Mon Sep 17 00:00:00 2001 +From c4da1a4d0efa203d10fdceda267816f7838c8a85 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sat, 19 Feb 2022 16:09:45 -0800 -Subject: [PATCH 2/6] hwmon: Add driver for Steam Deck's EC sensors +Subject: hwmon: Add driver for Steam Deck's EC sensors Add driver for sensors exposed by EC firmware on Steam Deck hardware. @@ -17,7 +17,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/drivers/hwmon/Kconfig +++ b/drivers/hwmon/Kconfig -@@ -2053,6 +2053,17 @@ config SENSORS_SCH5636 +@@ -2089,6 +2089,17 @@ config SENSORS_SCH5636 This driver can also be built as a module. If so, the module will be called sch5636. @@ -37,7 +37,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> depends on I2C --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile -@@ -208,6 +208,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47 +@@ -211,6 +211,7 @@ obj-$(CONFIG_SENSORS_SMSC47M1) += smsc47 obj-$(CONFIG_SENSORS_SMSC47M192)+= smsc47m192.o obj-$(CONFIG_SENSORS_SPARX5) += sparx5-temp.o obj-$(CONFIG_SENSORS_SPD5118) += spd5118.o diff --git a/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch index d8e9782..da70213 100644 --- a/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch +++ b/debian/patches/patchset-xanmod/valve/0003-hwmon-steamdeck-hwmon-Add-support-for-max-battery-le.patch @@ -1,7 +1,7 @@ -From 8181870b30687aa9351d919d082bc2b671a9c4cb Mon Sep 17 00:00:00 2001 +From 9f7d5453fd576ddf2c810146c5f61863b52d777d Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sat, 15 Jul 2023 12:58:54 -0700 -Subject: [PATCH 3/6] hwmon: steamdeck-hwmon: Add support for max battery +Subject: hwmon: steamdeck-hwmon: Add support for max battery level/rate Add support for max battery level/charge rate attributes. 
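The two hwmon hunks above only wire the Steam Deck sensor driver into Kconfig
and the Makefile; the driver itself sits on the standard "with-info" hwmon API.
A generic sketch of that registration pattern follows. All identifiers and the
placeholder values are illustrative, not the actual steamdeck-hwmon code:

    #include <linux/err.h>
    #include <linux/hwmon.h>
    #include <linux/platform_device.h>

    /* One read-only temperature channel. */
    static const struct hwmon_channel_info * const sd_info[] = {
            HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT),
            NULL
    };

    static umode_t sd_is_visible(const void *data, enum hwmon_sensor_types type,
                                 u32 attr, int channel)
    {
            return 0444;            /* everything world-readable */
    }

    static int sd_read(struct device *dev, enum hwmon_sensor_types type,
                       u32 attr, int channel, long *val)
    {
            *val = 42000;           /* placeholder: a real driver queries the EC */
            return 0;
    }

    static const struct hwmon_ops sd_ops = {
            .is_visible = sd_is_visible,
            .read       = sd_read,
    };

    static const struct hwmon_chip_info sd_chip = {
            .ops  = &sd_ops,
            .info = sd_info,
    };

    static int sd_probe(struct platform_device *pdev)
    {
            struct device *hwmon;

            /* devm_*: unregistered automatically on driver detach. */
            hwmon = devm_hwmon_device_register_with_info(&pdev->dev,
                            "steamdeck_hwmon", NULL, &sd_chip, NULL);
            return PTR_ERR_OR_ZERO(hwmon);
    }

Readings then surface as /sys/class/hwmon/hwmonN/temp1_input, which is how the
EC sensors become visible to lm-sensors and friends.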
diff --git a/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch index eb94a72..d909d83 100644 --- a/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch +++ b/debian/patches/patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch @@ -1,7 +1,7 @@ -From 4df11ab1bd9ad50e6ed928d1c2f3a8404775837b Mon Sep 17 00:00:00 2001 +From 93fc97eeb7fd11b7da124eab29c8d455331d364c Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sun, 27 Feb 2022 12:58:05 -0800 -Subject: [PATCH 4/6] leds: steamdeck: Add support for Steam Deck LED +Subject: leds: steamdeck: Add support for Steam Deck LED (cherry picked from commit 85a86d19aa7022ff0555023d53aef78323a42d0c) Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com> @@ -15,7 +15,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig -@@ -959,6 +959,13 @@ config LEDS_ACER_A500 +@@ -1003,6 +1003,13 @@ config LEDS_ACER_A500 This option enables support for the Power Button LED of Acer Iconia Tab A500. @@ -31,10 +31,10 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> comment "Flash and Torch LED drivers" --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile -@@ -81,6 +81,7 @@ obj-$(CONFIG_LEDS_POWERNV) += leds-powe - obj-$(CONFIG_LEDS_PWM) += leds-pwm.o +@@ -84,6 +84,7 @@ obj-$(CONFIG_LEDS_QNAP_MCU) += leds-qna obj-$(CONFIG_LEDS_REGULATOR) += leds-regulator.o obj-$(CONFIG_LEDS_SC27XX_BLTC) += leds-sc27xx-bltc.o + obj-$(CONFIG_LEDS_ST1202) += leds-st1202.o +obj-$(CONFIG_LEDS_STEAMDECK) += leds-steamdeck.o obj-$(CONFIG_LEDS_SUN50I_A100) += leds-sun50i-a100.o obj-$(CONFIG_LEDS_SUNFIRE) += leds-sunfire.o diff --git a/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch index 71fb5ea..52347cf 100644 --- a/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch +++ b/debian/patches/patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch @@ -1,7 +1,7 @@ -From 947c953bf24af62c58e9eb0bab533816882b83a3 Mon Sep 17 00:00:00 2001 +From 544af2c7ba194f959e8b317efb6e82b229b8ceff Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sat, 19 Feb 2022 16:08:36 -0800 -Subject: [PATCH 5/6] mfd: Add MFD core driver for Steam Deck +Subject: mfd: Add MFD core driver for Steam Deck Add MFD core driver for Steam Deck. Doesn't really do much so far besides instantiating a number of MFD cells that implement all the @@ -19,9 +19,9 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig -@@ -2402,5 +2402,16 @@ config MFD_RSMU_SPI - Additional drivers must be enabled in order to use the functionality - of the device. +@@ -2439,5 +2439,16 @@ config MFD_UPBOARD_FPGA + To compile this driver as a module, choose M here: the module will be + called upboard-fpga. 
+config MFD_STEAMDECK + tristate "Valve Steam Deck" @@ -38,10 +38,10 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> endif --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile -@@ -289,3 +289,5 @@ obj-$(CONFIG_MFD_ATC260X_I2C) += atc260x - - obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o +@@ -294,3 +294,5 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o + + obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +obj-$(CONFIG_MFD_STEAMDECK) += steamdeck.o --- /dev/null diff --git a/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch index f4ca45d..421bdf9 100644 --- a/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch +++ b/debian/patches/patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch @@ -1,7 +1,7 @@ -From 2f8a2543aa33103cf237853d5f2ca8999261dd0d Mon Sep 17 00:00:00 2001 +From cf5a7be3ab145c5743b673722ce01002dcdac3e6 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov <andrew.smirnov@gmail.com> Date: Sun, 24 Sep 2023 15:02:33 -0700 -Subject: [PATCH 6/6] mfd: steamdeck: Expose controller board power in sysfs +Subject: mfd: steamdeck: Expose controller board power in sysfs As of version 118 Deck's BIOS implements "SCBP" method that allows gating power of the controller board (VBUS). Add a basic WO method to diff --git a/debian/patches/patchset-xanmod/xanmod/0001-kbuild-Re-add-.config-file-required-to-sign-external.patch b/debian/patches/patchset-xanmod/xanmod/0001-kbuild-Re-add-.config-file-required-to-sign-external.patch new file mode 100644 index 0000000..6a1398a --- /dev/null +++ b/debian/patches/patchset-xanmod/xanmod/0001-kbuild-Re-add-.config-file-required-to-sign-external.patch @@ -0,0 +1,23 @@ +From 878cd0d9982ee6810036adce9e9c96cdb3714be1 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade <kernel@xanmod.org> +Date: Thu, 28 Nov 2024 22:55:27 +0000 +Subject: kbuild: Re-add .config file required to sign external + modules + +Signed-off-by: Alexandre Frade <kernel@xanmod.org> +--- + scripts/package/install-extmod-build | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/scripts/package/install-extmod-build ++++ b/scripts/package/install-extmod-build +@@ -44,6 +44,9 @@ mkdir -p "${destdir}" + fi + } | tar -c -f - -T - | tar -xf - -C "${destdir}" + ++# copy .config manually to be where it's expected to be ++cp "${KCONFIG_CONFIG}" "${destdir}/.config" ++ + # When ${CC} and ${HOSTCC} differ, rebuild host programs using ${CC}. + # + # This caters to host programs that participate in Kbuild. 
objtool and diff --git a/debian/patches/patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch similarity index 92% rename from debian/patches/patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch rename to debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch index aae2822..f6c2efb 100644 --- a/debian/patches/patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch +++ b/debian/patches/patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch @@ -1,7 +1,7 @@ -From 67e174927705e71b0d254ab6fab5af40193376a4 Mon Sep 17 00:00:00 2001 +From 6e1157f40aa2de736b79766c53f87dfe7de36bb5 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Sat, 31 Aug 2024 16:57:41 +0000 -Subject: [PATCH 03/18] kbuild: Remove GCC minimal function alignment +Subject: kbuild: Remove GCC minimal function alignment Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- @@ -12,7 +12,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/Makefile +++ b/Makefile -@@ -1004,15 +1004,8 @@ export CC_FLAGS_FPU +@@ -1056,15 +1056,8 @@ export CC_FLAGS_FPU export CC_FLAGS_NO_FPU ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0) @@ -30,7 +30,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> NOSTDINC_FLAGS += -nostdinc --- a/arch/Kconfig +++ b/arch/Kconfig -@@ -1667,18 +1667,6 @@ config FUNCTION_ALIGNMENT +@@ -1723,18 +1723,6 @@ config FUNCTION_ALIGNMENT default 4 if FUNCTION_ALIGNMENT_4B default 0 diff --git a/debian/patches/patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch similarity index 78% rename from debian/patches/patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch rename to debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch index eea1a0c..c7637f4 100644 --- a/debian/patches/patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch +++ b/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch @@ -1,7 +1,7 @@ -From 43c0eb6ded02d18daa26e0186ae2f92bec5bfb8f Mon Sep 17 00:00:00 2001 +From 91f0f89ac5315be99ea1aea5d732c68311f68bda Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Thu, 11 May 2023 19:41:41 +0000 -Subject: [PATCH 04/18] XANMOD: fair: Set scheduler tunable latencies to +Subject: XANMOD: fair: Set scheduler tunable latencies to unscaled Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -11,7 +11,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -66,7 +66,7 @@ +@@ -69,7 +69,7 @@ * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ diff --git a/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch similarity index 85% rename from debian/patches/patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch rename to debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch index 23027f5..fb1d8c4 100644 --- 
a/debian/patches/patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch +++ b/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch @@ -1,7 +1,7 @@ -From b20c46d59b4102165248167bd5911c2d695679cc Mon Sep 17 00:00:00 2001 +From 5a126e141df4850073a8f057cc5eeb22e8f6ea57 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Sun, 15 Sep 2024 23:03:38 +0000 -Subject: [PATCH 05/18] XANMOD: sched: Add yield_type sysctl to reduce or +Subject: XANMOD: sched: Add yield_type sysctl to reduce or disable sched_yield Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -12,7 +12,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c -@@ -1391,15 +1391,29 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t +@@ -1350,15 +1350,29 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t return ret; } @@ -53,7 +53,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -@@ -1631,6 +1632,15 @@ static struct ctl_table kern_table[] = { +@@ -1630,6 +1631,15 @@ static const struct ctl_table kern_table .proc_handler = proc_dointvec, }, #endif diff --git a/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch similarity index 90% rename from debian/patches/patchset-xanmod/xanmod/0004-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch rename to debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch index c254688..768c7e6 100644 --- a/debian/patches/patchset-xanmod/xanmod/0004-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch +++ b/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch @@ -1,7 +1,7 @@ -From d23f0554f1b381f082dc81a6f3c523b90043b941 Mon Sep 17 00:00:00 2001 +From f91c466320368433d644a1bbaeb303b682c6b7d1 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Wed, 11 May 2022 18:56:51 +0000 -Subject: [PATCH 06/18] XANMOD: block/mq-deadline: Increase write priority to +Subject: XANMOD: block/mq-deadline: Increase write priority to improve responsiveness Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch similarity index 81% rename from debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch rename to debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch index ae21210..f1fd3a3 100644 --- a/debian/patches/patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch +++ b/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch @@ -1,7 +1,7 @@ -From 8c3035b22be106d8659d85c2651e589f53e89cc5 Mon Sep 17 00:00:00 2001 +From 99aceb32885686182f2e38ed6c19a380828249b7 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Thu, 6 Jan 2022 16:59:01 +0000 -Subject: [PATCH 07/18] XANMOD: block/mq-deadline: Disable front_merges by +Subject: XANMOD: block/mq-deadline: Disable front_merges by default Signed-off-by: Alexandre 
Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch similarity index 79% rename from debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch rename to debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch index e06abf5..41191af 100644 --- a/debian/patches/patchset-xanmod/xanmod/0006-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch +++ b/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch @@ -1,7 +1,7 @@ -From 3d1e3f450e9ca926a899a0502fd34df6d483efae Mon Sep 17 00:00:00 2001 +From e664c30c44caccc43b50a7cde90d4ad2a57faef2 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 16 Sep 2024 15:36:01 +0000 -Subject: [PATCH 08/18] XANMOD: block: Set rq_affinity to force complete I/O +Subject: XANMOD: block: Set rq_affinity to force complete I/O requests on same CPU Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -11,7 +11,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -614,7 +614,8 @@ enum { +@@ -626,7 +626,8 @@ enum { QUEUE_FLAG_MAX }; diff --git a/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch similarity index 83% rename from debian/patches/patchset-xanmod/xanmod/0007-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch rename to debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch index e06428f..fd867a7 100644 --- a/debian/patches/patchset-xanmod/xanmod/0007-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch +++ b/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch @@ -1,7 +1,7 @@ -From 80e2bd58a4f13d1a946c6616e18d124b1291f2a7 Mon Sep 17 00:00:00 2001 +From 34db71a0c7669de56fb221bacb4955012f52efa8 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 15 Jul 2024 04:50:34 +0000 -Subject: [PATCH 09/18] XANMOD: blk-wbt: Set wbt_default_latency_nsec() to +Subject: XANMOD: blk-wbt: Set wbt_default_latency_nsec() to 2msec Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch similarity index 86% rename from debian/patches/patchset-xanmod/xanmod/0008-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch rename to debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch index e03b5e6..7525e40 100644 --- a/debian/patches/patchset-xanmod/xanmod/0008-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch +++ b/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch @@ -1,7 +1,7 @@ -From 74767b639b4e9141b1961764655111a4fd62a5ab Mon Sep 17 00:00:00 2001 +From 6f6902c8942b881988088c7f7d61053b41f00f0a Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 29 Jan 2018 17:26:15 +0000 -Subject: [PATCH 10/18] XANMOD: kconfig: add 500Hz timer interrupt 
kernel +Subject: XANMOD: kconfig: add 500Hz timer interrupt kernel config option Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch similarity index 80% rename from debian/patches/patchset-xanmod/xanmod/0009-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch rename to debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch index ff618ae..bae2bbe 100644 --- a/debian/patches/patchset-xanmod/xanmod/0009-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch +++ b/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch @@ -1,7 +1,7 @@ -From a047058c64e9d75db8e714a8c1202057920e21c7 Mon Sep 17 00:00:00 2001 +From 269ed90bb0c714fc237be05611c82804f81b7038 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 29 Jan 2018 16:59:22 +0000 -Subject: [PATCH 11/18] XANMOD: dcache: cache_pressure = 50 decreases the rate +Subject: XANMOD: dcache: cache_pressure = 50 decreases the rate at which VFS caches are reclaimed Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-mm-Raise-max_map_count-default-value.patch b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-Raise-max_map_count-default-value.patch similarity index 90% rename from debian/patches/patchset-xanmod/xanmod/0010-XANMOD-mm-Raise-max_map_count-default-value.patch rename to debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-Raise-max_map_count-default-value.patch index fd6228c..8a120b0 100644 --- a/debian/patches/patchset-xanmod/xanmod/0010-XANMOD-mm-Raise-max_map_count-default-value.patch +++ b/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-Raise-max_map_count-default-value.patch @@ -1,7 +1,7 @@ -From 910bd8c627ea16ea9bcf70c153197aaba473b6b9 Mon Sep 17 00:00:00 2001 +From ba310efa15e3c9677121c31e79b72695bcca87df Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Sun, 28 Apr 2024 09:06:54 +0000 -Subject: [PATCH 12/18] XANMOD: mm: Raise max_map_count default value +Subject: XANMOD: mm: Raise max_map_count default value Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- @@ -22,7 +22,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> mem_profiling --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -191,17 +191,18 @@ static inline void __mm_zero_struct_page +@@ -192,17 +192,18 @@ static inline void __mm_zero_struct_page * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. 
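The mm.h comment kept as context above is the whole story behind this patch:
mainline derives the vm.max_map_count default from ELF's 16-bit section counter
so that a core dump can still describe every VMA. For reference, the mainline
baseline being raised looks like this (current include/linux/mm.h; the new
xanmod value itself falls outside the visible hunk, so it is not reproduced
here):

    /* ELF stores the section count in an unsigned short, and a core
     * dump emits one section per VMA, so headroom is reserved for a
     * few non-VMA sections.
     */
    #define MAPCOUNT_ELF_CORE_MARGIN        (5)
    #define DEFAULT_MAX_MAP_COUNT   (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

    /* USHRT_MAX - 5 == 65530: the familiar vm.max_map_count default. */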
diff --git a/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch similarity index 75% rename from debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch rename to debian/patches/patchset-xanmod/xanmod/0012-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch index 0c7d7c1..e593597 100644 --- a/debian/patches/patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch +++ b/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch @@ -1,7 +1,7 @@ -From 1ad86d993666c2d74ed6fd97e143b073e4b2c4c9 Mon Sep 17 00:00:00 2001 +From 14ff7a682d0936937d6813105484da7b6245aabb Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Wed, 14 Aug 2024 18:54:53 +0000 -Subject: [PATCH 13/18] XANMOD: mm/vmscan: Set minimum amount of swapping +Subject: XANMOD: mm/vmscan: Set minimum amount of swapping Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- diff --git a/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch similarity index 87% rename from debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch rename to debian/patches/patchset-xanmod/xanmod/0013-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch index b0c9ba7..49c0b47 100644 --- a/debian/patches/patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch +++ b/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch @@ -1,7 +1,7 @@ -From a24ca4c968092cf419821aaaa57b070c088e74e7 Mon Sep 17 00:00:00 2001 +From 2354e3f9a9b181ca2e150c27c57a01049b52b6f0 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Wed, 15 Jun 2022 17:07:29 +0000 -Subject: [PATCH 14/18] XANMOD: sched/autogroup: Add kernel parameter and +Subject: XANMOD: sched/autogroup: Add kernel parameter and config option to enable/disable autogroup feature by default Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -13,7 +13,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -498,6 +498,10 @@ +@@ -511,6 +511,10 @@ Format: <int> (must be >=0) Default: 64 @@ -24,9 +24,9 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> bau= [X86_UV] Enable the BAU on SGI UV. The default behavior is to disable the BAU (i.e. bau=0). Format: { "0" | "1" } -@@ -3881,8 +3885,6 @@ - noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any - IOAPICs that may be present in the system. +@@ -4039,8 +4043,6 @@ + + noapictimer [APIC,X86] Don't set up the APIC timer - noautogroup Disable scheduler automatic task group creation. - @@ -35,7 +35,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> no_console_suspend --- a/init/Kconfig +++ b/init/Kconfig -@@ -1353,6 +1353,18 @@ config SCHED_AUTOGROUP +@@ -1367,6 +1367,18 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. 
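The documentation hunk above swaps the old one-way "noautogroup" switch for a
boolean "autogroup=" parameter backed by a new Kconfig default. A sketch of the
usual shape of such a handler, with assumed names (the real body lives in the
autogroup.c hunk of the patch, which this refresh does not show):

    #include <linux/init.h>
    #include <linux/kstrtox.h>

    /* Sketch: parse "autogroup=0" / "autogroup=1" at boot.
     * sysctl_sched_autogroup_enabled is the existing mainline knob.
     */
    static int __init setup_autogroup(char *str)
    {
            unsigned long enabled;

            if (kstrtoul(str, 0, &enabled) || enabled > 1)
                    return 0;       /* malformed value: reject */

            sysctl_sched_autogroup_enabled = enabled;
            return 1;
    }
    __setup("autogroup=", setup_autogroup);

Either boot-time setting can still be overridden at runtime through the
kernel.sched_autogroup_enabled sysctl.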
diff --git a/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch similarity index 94% rename from debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch rename to debian/patches/patchset-xanmod/xanmod/0014-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch index 86b7862..0b76274 100644 --- a/debian/patches/patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch +++ b/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch @@ -1,7 +1,7 @@ -From 4664b97efde786ff28f2eb234c1d59c9da30c3b4 Mon Sep 17 00:00:00 2001 +From fe02f80f7e47a5ae805393bcba3dbe8c2bd74b0e Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Tue, 31 Mar 2020 13:32:08 -0300 -Subject: [PATCH 15/18] XANMOD: cpufreq: tunes ondemand and conservative +Subject: XANMOD: cpufreq: tunes ondemand and conservative governor for performance Signed-off-by: Alexandre Frade <kernel@xanmod.org> diff --git a/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch similarity index 83% rename from debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch rename to debian/patches/patchset-xanmod/xanmod/0015-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch index a74c187..7054920 100644 --- a/debian/patches/patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch +++ b/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch @@ -1,7 +1,7 @@ -From 444f831f229a418b4865d11940b3987f55ab151f Mon Sep 17 00:00:00 2001 +From f2c2f7ec98ca5bfda92d4691af46403348ae0d77 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 16 Sep 2024 08:09:56 +0000 -Subject: [PATCH 16/18] XANMOD: lib/kconfig.debug: disable default +Subject: XANMOD: lib/kconfig.debug: disable default SYMBOLIC_ERRNAME and DEBUG_BUGVERBOSE Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -12,14 +12,14 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig -@@ -23,7 +23,6 @@ config BCACHEFS_FS +@@ -24,7 +24,6 @@ config BCACHEFS_FS select XOR_BLOCKS select XXHASH select SRCU - select SYMBOLIC_ERRNAME + select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with - support for multiple devices, compression, checksumming, etc. 
--- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -190,7 +190,7 @@ config DYNAMIC_DEBUG_CORE diff --git a/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch similarity index 70% rename from debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch rename to debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch index a5bb961..963e803 100644 --- a/debian/patches/patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch +++ b/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch @@ -1,7 +1,7 @@ -From 3536b212b829712a928b03cf513f3da87e15b3ef Mon Sep 17 00:00:00 2001 +From c706cd7134b55e1f188de6ea23e4b25b0497f18e Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Sun, 29 May 2022 00:57:40 +0000 -Subject: [PATCH 17/18] XANMOD: scripts/setlocalversion: remove "+" tag for git +Subject: XANMOD: scripts/setlocalversion: remove "+" tag for git repo short version Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -11,7 +11,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/scripts/setlocalversion +++ b/scripts/setlocalversion -@@ -113,7 +113,6 @@ scm_version() +@@ -117,7 +117,6 @@ scm_version() # If only the short version is requested, don't bother # running further git commands if $short; then diff --git a/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch b/debian/patches/patchset-xanmod/xanmod/0017-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch similarity index 72% rename from debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch rename to debian/patches/patchset-xanmod/xanmod/0017-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch index 862e678..0495cd2 100644 --- a/debian/patches/patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch +++ b/debian/patches/patchset-xanmod/xanmod/0017-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch @@ -1,7 +1,7 @@ -From 857de795e16a927cf251e5ede247b6e96938916e Mon Sep 17 00:00:00 2001 +From 4c8da54c3f59b0e71408b0c980ffb162fc4bb022 Mon Sep 17 00:00:00 2001 From: Alexandre Frade <kernel@xanmod.org> Date: Mon, 24 Apr 2023 04:50:34 +0000 -Subject: [PATCH 18/18] XANMOD: scripts/setlocalversion: Move localversion* +Subject: XANMOD: scripts/setlocalversion: Move localversion* files to the end Signed-off-by: Alexandre Frade <kernel@xanmod.org> @@ -11,7 +11,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org> --- a/scripts/setlocalversion +++ b/scripts/setlocalversion -@@ -204,4 +204,4 @@ elif [ "${LOCALVERSION+set}" != "set" ]; +@@ -208,4 +208,4 @@ elif [ "${LOCALVERSION+set}" != "set" ]; scm_version="$(scm_version --short)" fi diff --git a/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch b/debian/patches/patchset-zen/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch similarity index 75% rename from debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch rename to debian/patches/patchset-zen/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch index b0ca8a2..fcb0b83 100644 --- 
a/debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch +++ b/debian/patches/patchset-zen/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch @@ -1,17 +1,19 @@ -From cda0e050fec85635986e9cfe991e26339bf305dc Mon Sep 17 00:00:00 2001 +From 6dada600ab3579296c9b2b57cf41b95792f021ed Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> Date: Sat, 13 Jan 2024 15:29:25 +0100 Subject: arch/Kconfig: Default to maximum amount of ASLR bits -To mitigate https://zolutal.github.io/aslrnt/; do this with a patch to -avoid having to enable `CONFIG_EXPERT`. +To mitigate CVE-2024-26621 and improve randomization quality further. Do +this with a patch to avoid having to enable `CONFIG_EXPERT`. + +Cherry-picked-for: https://zolutal.github.io/aslrnt/ --- arch/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- a/arch/Kconfig +++ b/arch/Kconfig -@@ -1089,7 +1089,7 @@ config ARCH_MMAP_RND_BITS +@@ -1137,7 +1137,7 @@ config ARCH_MMAP_RND_BITS int "Number of bits to use for ASLR of mmap base address" if EXPERT range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT @@ -20,7 +22,7 @@ avoid having to enable `CONFIG_EXPERT`. depends on HAVE_ARCH_MMAP_RND_BITS help This value can be used to select the number of bits to use to -@@ -1123,7 +1123,7 @@ config ARCH_MMAP_RND_COMPAT_BITS +@@ -1171,7 +1171,7 @@ config ARCH_MMAP_RND_COMPAT_BITS int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT diff --git a/debian/patches/patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch b/debian/patches/patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch deleted file mode 100644 index 14c7c38..0000000 --- a/debian/patches/patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch +++ /dev/null @@ -1,162 +0,0 @@ -From 3c32c0d457a2c4b2817f57e1e2c9cbba4624639e Mon Sep 17 00:00:00 2001 -From: Linus Torvalds <torvalds@linux-foundation.org> -Date: Fri, 22 Nov 2024 11:33:05 -0800 -Subject: futex: improve user space accesses - -Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9% -improvement" report for his patch that changed __get_user() to use -pointer masking instead of the explicit speculation barrier. However, -that patch doesn't actually work in the general case, because some (very -bad) architecture-specific code actually depends on __get_user() also -working on kernel addresses. - -A profile showed that the offending __get_user() was the futex code, -which really should be fixed up to not use that horrid legacy case. -Rewrite futex_get_value_locked() to use the modern user acccess helpers, -and inline it so that the compiler not only avoids the function call for -a few instructions, but can do CSE on the address masking. - -It also turns out the x86 futex functions have unnecessary barriers in -other places, so let's fix those up too. 
- -Link: https://lore.kernel.org/all/20241115230653.hfvzyf3aqqntgp63@jpoimboe/ -Reported-by: Josh Poimboeuf <jpoimboe@kernel.org> -Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> ---- - arch/x86/include/asm/futex.h | 8 +++-- - kernel/futex/core.c | 22 -------------- - kernel/futex/futex.h | 59 ++++++++++++++++++++++++++++++++++-- - 3 files changed, 63 insertions(+), 26 deletions(-) - ---- a/arch/x86/include/asm/futex.h -+++ b/arch/x86/include/asm/futex.h -@@ -48,7 +48,9 @@ do { \ - static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, - u32 __user *uaddr) - { -- if (!user_access_begin(uaddr, sizeof(u32))) -+ if (can_do_masked_user_access()) -+ uaddr = masked_user_access_begin(uaddr); -+ else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - - switch (op) { -@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_i - { - int ret = 0; - -- if (!user_access_begin(uaddr, sizeof(u32))) -+ if (can_do_masked_user_access()) -+ uaddr = masked_user_access_begin(uaddr); -+ else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - asm volatile("\n" - "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" ---- a/kernel/futex/core.c -+++ b/kernel/futex/core.c -@@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct - return NULL; - } - --int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) --{ -- int ret; -- -- pagefault_disable(); -- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); -- pagefault_enable(); -- -- return ret; --} -- --int futex_get_value_locked(u32 *dest, u32 __user *from) --{ -- int ret; -- -- pagefault_disable(); -- ret = __get_user(*dest, from); -- pagefault_enable(); -- -- return ret ? -EFAULT : 0; --} -- - /** - * wait_for_owner_exiting - Block until the owner has exited - * @ret: owner's current futex lock status ---- a/kernel/futex/futex.h -+++ b/kernel/futex/futex.h -@@ -6,6 +6,7 @@ - #include <linux/rtmutex.h> - #include <linux/sched/wake_q.h> - #include <linux/compat.h> -+#include <linux/uaccess.h> - - #ifdef CONFIG_PREEMPT_RT - #include <linux/rcuwait.h> -@@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct fut - extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); - - extern int fault_in_user_writeable(u32 __user *uaddr); --extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); --extern int futex_get_value_locked(u32 *dest, u32 __user *from); - extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); - -+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); -+ pagefault_enable(); -+ -+ return ret; -+} -+ -+/* -+ * This does a plain atomic user space read, and the user pointer has -+ * already been verified earlier by get_futex_key() to be both aligned -+ * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). -+ * -+ * We still want to avoid any speculation, and while __get_user() is -+ * the traditional model for this, it's actually slower then doing -+ * this manually these days. -+ * -+ * We could just have a per-architecture special function for it, -+ * the same way we do futex_atomic_cmpxchg_inatomic(), but rather -+ * than force everybody to do that, write it out long-hand using -+ * the low-level user-access infrastructure. 
-+ * -+ * This looks a bit overkill, but generally just results in a couple -+ * of instructions. -+ */ -+static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) -+{ -+ u32 val; -+ -+ if (can_do_masked_user_access()) -+ from = masked_user_access_begin(from); -+ else if (!user_read_access_begin(from, sizeof(*from))) -+ return -EFAULT; -+ unsafe_get_user(val, from, Efault); -+ user_access_end(); -+ *dest = val; -+ return 0; -+Efault: -+ user_access_end(); -+ return -EFAULT; -+} -+ -+static inline int futex_get_value_locked(u32 *dest, u32 __user *from) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = futex_read_inatomic(dest, from); -+ pagefault_enable(); -+ -+ return ret; -+} -+ - extern void __futex_unqueue(struct futex_q *q); - extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, - struct task_struct *task); diff --git a/debian/patches/patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch b/debian/patches/patchset-zen/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch similarity index 91% rename from debian/patches/patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch rename to debian/patches/patchset-zen/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch index a97bffc..e7811ec 100644 --- a/debian/patches/patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch +++ b/debian/patches/patchset-zen/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch @@ -1,4 +1,4 @@ -From 218e958524c673d6e68737e7f82d80ba2b6ef59a Mon Sep 17 00:00:00 2001 +From 5ac90c5aed97728c8f4f64c02d75334c84a801ef Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas <javierm@redhat.com> Date: Thu, 19 May 2022 14:40:07 +0200 Subject: drivers/firmware: skip simpledrm if nvidia-drm.modeset=1 is set @@ -42,7 +42,9 @@ For this to work, the CONFIG_FB_EFI and CONFIG_FB_VESA config options must be enabled besides CONFIG_DRM_SIMPLEDRM. Signed-off-by: Javier Martinez Canillas <javierm@redhat.com> +Source: https://gitlab.com/cki-project/kernel-ark/-/merge_requests/1788 Cherry-picked-for: https://bugs.archlinux.org/task/73720 +Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/94 --- drivers/firmware/sysfb.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) @@ -72,7 +74,7 @@ Cherry-picked-for: https://bugs.archlinux.org/task/73720 static struct platform_device *pd; static DEFINE_MUTEX(disable_lock); static bool disabled; -@@ -145,7 +161,7 @@ static __init int sysfb_init(void) +@@ -164,7 +180,7 @@ static __init int sysfb_init(void) /* try to create a simple-framebuffer device */ compatible = sysfb_parse_mode(si, &mode); diff --git a/debian/patches/patchset-zen/fixes/0003-EDAC-igen6-Fix-the-flood-of-invalid-error-reports.patch b/debian/patches/patchset-zen/fixes/0003-EDAC-igen6-Fix-the-flood-of-invalid-error-reports.patch new file mode 100644 index 0000000..6328b9b --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0003-EDAC-igen6-Fix-the-flood-of-invalid-error-reports.patch @@ -0,0 +1,56 @@ +From 69907adec3041a6a89d192441a61481d80ee5806 Mon Sep 17 00:00:00 2001 +From: Qiuxu Zhuo <qiuxu.zhuo@intel.com> +Date: Wed, 12 Feb 2025 16:33:54 +0800 +Subject: EDAC/igen6: Fix the flood of invalid error reports + +The ECC_ERROR_LOG register of certain SoCs may contain the invalid value +~0, which results in a flood of invalid error reports in polling mode. 
+ +Fix the flood of invalid error reports by skipping the invalid ECC error +log value ~0. + +Fixes: e14232afa944 ("EDAC/igen6: Add polling support") +Reported-by: Ramses <ramses@well-founded.dev> +Closes: https://lore.kernel.org/all/OISL8Rv--F-9@well-founded.dev/ +Tested-by: Ramses <ramses@well-founded.dev> +Reported-by: John <therealgraysky@proton.me> +Closes: https://lore.kernel.org/all/p5YcxOE6M3Ncxpn2-Ia_wCt61EM4LwIiN3LroQvT_-G2jMrFDSOW5k2A9D8UUzD2toGpQBN1eI0sL5dSKnkO8iteZegLoQEj-DwQaMhGx4A=@proton.me/ +Tested-by: John <therealgraysky@proton.me> +Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> +Signed-off-by: Tony Luck <tony.luck@intel.com> +Link: https://lore.kernel.org/r/20250212083354.31919-1-qiuxu.zhuo@intel.com +--- + drivers/edac/igen6_edac.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +--- a/drivers/edac/igen6_edac.c ++++ b/drivers/edac/igen6_edac.c +@@ -785,13 +785,22 @@ static u64 ecclog_read_and_clear(struct + { + u64 ecclog = readq(imc->window + ECC_ERROR_LOG_OFFSET); + +- if (ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)) { +- /* Clear CE/UE bits by writing 1s */ +- writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET); +- return ecclog; +- } ++ /* ++ * Quirk: The ECC_ERROR_LOG register of certain SoCs may contain ++ * the invalid value ~0. This will result in a flood of invalid ++ * error reports in polling mode. Skip it. ++ */ ++ if (ecclog == ~0) ++ return 0; + +- return 0; ++ /* Neither a CE nor a UE. Skip it.*/ ++ if (!(ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE))) ++ return 0; ++ ++ /* Clear CE/UE bits by writing 1s */ ++ writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET); ++ ++ return ecclog; + } + + static void errsts_clear(struct igen6_imc *imc) diff --git a/debian/patches/patchset-zen/invlpgb-v9/0003-x86-mm-consolidate-full-flush-threshold-decision.patch b/debian/patches/patchset-zen/invlpgb-v9/0003-x86-mm-consolidate-full-flush-threshold-decision.patch deleted file mode 100644 index 70b2c27..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0003-x86-mm-consolidate-full-flush-threshold-decision.patch +++ /dev/null @@ -1,113 +0,0 @@ -From 7ac6508c4db81eced5f6e3d7c8913af1da6cf110 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:22 -0500 -Subject: x86/mm: consolidate full flush threshold decision - -Reduce code duplication by consolidating the decision point -for whether to do individual invalidations or a full flush -inside get_flush_tlb_info. - -Signed-off-by: Rik van Riel <riel@surriel.com> -Suggested-by: Dave Hansen <dave.hansen@intel.com> ---- - arch/x86/mm/tlb.c | 56 ++++++++++++++++++++++++++--------------------- - 1 file changed, 31 insertions(+), 25 deletions(-) - ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_ - BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); - #endif - -- info->start = start; -- info->end = end; -+ /* -+ * Round the start and end addresses to the page size specified -+ * by the stride shift. This ensures partial pages at the end of -+ * a range get fully invalidated. 
-+ */ -+ info->start = round_down(start, 1 << stride_shift); -+ info->end = round_up(end, 1 << stride_shift); - info->mm = mm; - info->stride_shift = stride_shift; - info->freed_tables = freed_tables; -@@ -1009,6 +1014,19 @@ static struct flush_tlb_info *get_flush_ - info->initiating_cpu = smp_processor_id(); - info->trim_cpumask = 0; - -+ WARN_ONCE(start != info->start || end != info->end, -+ "TLB flush not stride %x aligned. Start %lx, end %lx\n", -+ 1 << stride_shift, start, end); -+ -+ /* -+ * If the number of flushes is so large that a full flush -+ * would be faster, do a full flush. -+ */ -+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { -+ info->start = 0; -+ info->end = TLB_FLUSH_ALL; -+ } -+ - return info; - } - -@@ -1026,17 +1044,8 @@ void flush_tlb_mm_range(struct mm_struct - bool freed_tables) - { - struct flush_tlb_info *info; -+ int cpu = get_cpu(); - u64 new_tlb_gen; -- int cpu; -- -- cpu = get_cpu(); -- -- /* Should we flush just the requested range? */ -- if ((end == TLB_FLUSH_ALL) || -- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { -- start = 0; -- end = TLB_FLUSH_ALL; -- } - - /* This is also a barrier that synchronizes with switch_mm(). */ - new_tlb_gen = inc_mm_tlb_gen(mm); -@@ -1089,22 +1098,19 @@ static void do_kernel_range_flush(void * - - void flush_tlb_kernel_range(unsigned long start, unsigned long end) - { -- /* Balance as user space task's flush, a bit conservative */ -- if (end == TLB_FLUSH_ALL || -- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { -- on_each_cpu(do_flush_tlb_all, NULL, 1); -- } else { -- struct flush_tlb_info *info; -+ struct flush_tlb_info *info; -+ -+ guard(preempt)(); - -- preempt_disable(); -- info = get_flush_tlb_info(NULL, start, end, 0, false, -- TLB_GENERATION_INVALID); -+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, -+ TLB_GENERATION_INVALID); - -+ if (info->end == TLB_FLUSH_ALL) -+ on_each_cpu(do_flush_tlb_all, NULL, 1); -+ else - on_each_cpu(do_kernel_range_flush, info, 1); - -- put_flush_tlb_info(); -- preempt_enable(); -- } -+ put_flush_tlb_info(); - } - - /* -@@ -1276,7 +1282,7 @@ void arch_tlbbatch_flush(struct arch_tlb - - int cpu = get_cpu(); - -- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, -+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false, - TLB_GENERATION_INVALID); - /* - * flush_tlb_multi() is not optimized for the common case in which only diff --git a/debian/patches/patchset-zen/invlpgb-v9/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch b/debian/patches/patchset-zen/invlpgb-v9/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch deleted file mode 100644 index 0d435fd..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch +++ /dev/null @@ -1,90 +0,0 @@ -From e772b2eb66e5c3cf668feadab678f2a88d896189 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:23 -0500 -Subject: x86/mm: get INVLPGB count max from CPUID - -The CPU advertises the maximum number of pages that can be shot down -with one INVLPGB instruction in the CPUID data. - -Save that information for later use. 
- -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/Kconfig.cpu | 5 +++++ - arch/x86/include/asm/cpufeatures.h | 1 + - arch/x86/include/asm/tlbflush.h | 7 +++++++ - arch/x86/kernel/cpu/amd.c | 8 ++++++++ - 4 files changed, 21 insertions(+) - ---- a/arch/x86/Kconfig.cpu -+++ b/arch/x86/Kconfig.cpu -@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES - def_bool y - depends on IA32_FEAT_CTL - -+config X86_BROADCAST_TLB_FLUSH -+ def_bool y -+ depends on CPU_SUP_AMD && 64BIT -+ - menuconfig PROCESSOR_SELECT - bool "Supported processor vendors" if EXPERT - help -@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32 - config CPU_SUP_AMD - default y - bool "Support AMD processors" if PROCESSOR_SELECT -+ select X86_BROADCAST_TLB_FLUSH - help - This enables detection, tunings and quirks for AMD processors - ---- a/arch/x86/include/asm/cpufeatures.h -+++ b/arch/x86/include/asm/cpufeatures.h -@@ -335,6 +335,7 @@ - #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ - #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ - #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ -+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */ - #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ - #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ - #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void) - extern unsigned long mmu_cr4_features; - extern u32 *trampoline_cr4_features; - -+/* How many pages can we invalidate with one INVLPGB. */ -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+extern u16 invlpgb_count_max; -+#else -+#define invlpgb_count_max 1 -+#endif -+ - extern void initialize_tlbstate_and_flush(void); - - /* ---- a/arch/x86/kernel/cpu/amd.c -+++ b/arch/x86/kernel/cpu/amd.c -@@ -29,6 +29,8 @@ - - #include "cpu.h" - -+u16 invlpgb_count_max __ro_after_init; -+ - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) - { - u32 gprs[8] = { 0 }; -@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp - tlb_lli_2m[ENTRIES] = eax & mask; - - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; -+ -+ /* Max number of pages INVLPGB can invalidate in one shot */ -+ if (boot_cpu_has(X86_FEATURE_INVLPGB)) { -+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx); -+ invlpgb_count_max = (edx & 0xffff) + 1; -+ } - } - - static const struct cpu_dev amd_cpu_dev = { diff --git a/debian/patches/patchset-zen/invlpgb-v9/0005-x86-mm-add-INVLPGB-support-code.patch b/debian/patches/patchset-zen/invlpgb-v9/0005-x86-mm-add-INVLPGB-support-code.patch deleted file mode 100644 index 96d4c3a..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0005-x86-mm-add-INVLPGB-support-code.patch +++ /dev/null @@ -1,130 +0,0 @@ -From 7a896b12875e2b988acbf0229fb4bcf9157b83bd Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:24 -0500 -Subject: x86/mm: add INVLPGB support code - -Add invlpgb.h with the helper functions and definitions needed to use -broadcast TLB invalidation on AMD EPYC 3 and newer CPUs. 
- -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/include/asm/invlpgb.h | 101 ++++++++++++++++++++++++++++++++ - arch/x86/include/asm/tlbflush.h | 1 + - 2 files changed, 102 insertions(+) - create mode 100644 arch/x86/include/asm/invlpgb.h - ---- /dev/null -+++ b/arch/x86/include/asm/invlpgb.h -@@ -0,0 +1,101 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _ASM_X86_INVLPGB -+#define _ASM_X86_INVLPGB -+ -+#include <linux/kernel.h> -+#include <vdso/bits.h> -+#include <vdso/page.h> -+ -+/* -+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. -+ * -+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can -+ * be done in a parallel fashion. -+ * -+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from -+ * this CPU have completed. -+ */ -+static inline void __invlpgb(unsigned long asid, unsigned long pcid, -+ unsigned long addr, u16 extra_count, -+ bool pmd_stride, u8 flags) -+{ -+ u32 edx = (pcid << 16) | asid; -+ u32 ecx = (pmd_stride << 31) | extra_count; -+ u64 rax = addr | flags; -+ -+ /* The low bits in rax are for flags. Verify addr is clean. */ -+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); -+ -+ /* INVLPGB; supported in binutils >= 2.36. */ -+ asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx)); -+} -+ -+/* Wait for INVLPGB originated by this CPU to complete. */ -+static inline void tlbsync(void) -+{ -+ cant_migrate(); -+ /* TLBSYNC: supported in binutils >= 0.36. */ -+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); -+} -+ -+/* -+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination -+ * of the three. For example: -+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address -+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID -+ * -+ * The first can be used to invalidate (kernel) mappings at a particular -+ * address across all processes. -+ * -+ * The latter invalidates all TLB entries matching a PCID. -+ */ -+#define INVLPGB_VA BIT(0) -+#define INVLPGB_PCID BIT(1) -+#define INVLPGB_ASID BIT(2) -+#define INVLPGB_INCLUDE_GLOBAL BIT(3) -+#define INVLPGB_FINAL_ONLY BIT(4) -+#define INVLPGB_INCLUDE_NESTED BIT(5) -+ -+/* Flush all mappings for a given pcid and addr, not including globals. */ -+static inline void invlpgb_flush_user(unsigned long pcid, -+ unsigned long addr) -+{ -+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); -+ tlbsync(); -+} -+ -+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, -+ unsigned long addr, -+ u16 nr, -+ bool pmd_stride) -+{ -+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); -+} -+ -+/* Flush all mappings for a given PCID, not including globals. */ -+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) -+{ -+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); -+} -+ -+/* Flush all mappings, including globals, for all PCIDs. */ -+static inline void invlpgb_flush_all(void) -+{ -+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); -+ tlbsync(); -+} -+ -+/* Flush addr, including globals, for all PCIDs. */ -+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) -+{ -+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); -+} -+ -+/* Flush all mappings for all PCIDs except globals. 
*/ -+static inline void invlpgb_flush_all_nonglobals(void) -+{ -+ __invlpgb(0, 0, 0, 0, 0, 0); -+ tlbsync(); -+} -+ -+#endif /* _ASM_X86_INVLPGB */ ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -10,6 +10,7 @@ - #include <asm/cpufeature.h> - #include <asm/special_insns.h> - #include <asm/smp.h> -+#include <asm/invlpgb.h> - #include <asm/invpcid.h> - #include <asm/pti.h> - #include <asm/processor-flags.h> diff --git a/debian/patches/patchset-zen/invlpgb-v9/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch b/debian/patches/patchset-zen/invlpgb-v9/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch deleted file mode 100644 index aaada4e..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 99f2b0eda74d7ec76c9c48b78f9d30d251501c28 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:25 -0500 -Subject: x86/mm: use INVLPGB for kernel TLB flushes - -Use broadcast TLB invalidation for kernel addresses when available. - -Remove the need to send IPIs for kernel TLB flushes. - -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++- - 1 file changed, 27 insertions(+), 1 deletion(-) - ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -1086,6 +1086,30 @@ void flush_tlb_all(void) - on_each_cpu(do_flush_tlb_all, NULL, 1); - } - -+static bool broadcast_kernel_range_flush(struct flush_tlb_info *info) -+{ -+ unsigned long addr; -+ unsigned long nr; -+ -+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH)) -+ return false; -+ -+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ return false; -+ -+ if (info->end == TLB_FLUSH_ALL) { -+ invlpgb_flush_all(); -+ return true; -+ } -+ -+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { -+ nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max); -+ invlpgb_flush_addr_nosync(addr, nr); -+ } -+ tlbsync(); -+ return true; -+} -+ - static void do_kernel_range_flush(void *info) - { - struct flush_tlb_info *f = info; -@@ -1105,7 +1129,9 @@ void flush_tlb_kernel_range(unsigned lon - info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, - TLB_GENERATION_INVALID); - -- if (info->end == TLB_FLUSH_ALL) -+ if (broadcast_kernel_range_flush(info)) -+ ; /* Fall through. */ -+ else if (info->end == TLB_FLUSH_ALL) - on_each_cpu(do_flush_tlb_all, NULL, 1); - else - on_each_cpu(do_kernel_range_flush, info, 1); diff --git a/debian/patches/patchset-zen/invlpgb-v9/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch b/debian/patches/patchset-zen/invlpgb-v9/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch deleted file mode 100644 index b9742aa..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch +++ /dev/null @@ -1,45 +0,0 @@ -From 1ef7edb5b2375d4010ed2ad0c7d87fcfa7ab4519 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:26 -0500 -Subject: x86/mm: use INVLPGB in flush_tlb_all - -The flush_tlb_all() function is not used a whole lot, but we might -as well use broadcast TLB flushing there, too. 
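Why the broadcast path below runs under guard(preempt)() (a sketch using helpers from the earlier patches, not new kernel code): TLBSYNC only waits for INVLPGBs issued by the current CPU, so the flush and its implicit sync must not be separated by a migration.

    static void broadcast_flush_sketch(void)
    {
        preempt_disable();          /* roughly what guard(preempt)() does */
        invlpgb_flush_all();        /* INVLPGB + TLBSYNC on this CPU */
        preempt_enable();
    }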
- -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/mm/tlb.c | 15 +++++++++++++++ - 1 file changed, 15 insertions(+) - ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -1074,6 +1074,19 @@ void flush_tlb_mm_range(struct mm_struct - } - - -+static bool broadcast_flush_tlb_all(void) -+{ -+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH)) -+ return false; -+ -+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ return false; -+ -+ guard(preempt)(); -+ invlpgb_flush_all(); -+ return true; -+} -+ - static void do_flush_tlb_all(void *info) - { - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); -@@ -1082,6 +1095,8 @@ static void do_flush_tlb_all(void *info) - - void flush_tlb_all(void) - { -+ if (broadcast_flush_tlb_all()) -+ return; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - on_each_cpu(do_flush_tlb_all, NULL, 1); - } diff --git a/debian/patches/patchset-zen/invlpgb-v9/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch b/debian/patches/patchset-zen/invlpgb-v9/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch deleted file mode 100644 index fc83b7c..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch +++ /dev/null @@ -1,603 +0,0 @@ -From c7212dc64d8e9e4f12f1c6edea3b75c350a30381 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:28 -0500 -Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded - processes - -Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3 -and newer CPUs. - -In order to not exhaust PCID space, and keep TLB flushes local for single -threaded processes, we only hand out broadcast ASIDs to processes active on -4 or more CPUs. - -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/include/asm/mmu.h | 6 + - arch/x86/include/asm/mmu_context.h | 14 ++ - arch/x86/include/asm/tlbflush.h | 73 ++++++ - arch/x86/mm/tlb.c | 344 ++++++++++++++++++++++++++++- - 4 files changed, 425 insertions(+), 12 deletions(-) - ---- a/arch/x86/include/asm/mmu.h -+++ b/arch/x86/include/asm/mmu.h -@@ -69,6 +69,12 @@ typedef struct { - u16 pkey_allocation_map; - s16 execute_only_pkey; - #endif -+ -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+ u16 global_asid; -+ bool asid_transition; -+#endif -+ - } mm_context_t; - - #define INIT_MM_CONTEXT(mm) \ ---- a/arch/x86/include/asm/mmu_context.h -+++ b/arch/x86/include/asm/mmu_context.h -@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s - #define enter_lazy_tlb enter_lazy_tlb - extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); - -+extern void destroy_context_free_global_asid(struct mm_struct *mm); -+ - /* - * Init a new mm. Used on mm copies, like at fork() - * and on mm's that are brand-new, like at execve().
-@@ -161,6 +163,14 @@ static inline int init_new_context(struc - mm->context.execute_only_pkey = -1; - } - #endif -+ -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { -+ mm->context.global_asid = 0; -+ mm->context.asid_transition = false; -+ } -+#endif -+ - mm_reset_untag_mask(mm); - init_new_context_ldt(mm); - return 0; -@@ -170,6 +180,10 @@ static inline int init_new_context(struc - static inline void destroy_context(struct mm_struct *mm) - { - destroy_context_ldt(mm); -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ destroy_context_free_global_asid(mm); -+#endif - } - - extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -6,6 +6,7 @@ - #include <linux/mmu_notifier.h> - #include <linux/sched.h> - -+#include <asm/barrier.h> - #include <asm/processor.h> - #include <asm/cpufeature.h> - #include <asm/special_insns.h> -@@ -239,6 +240,78 @@ void flush_tlb_one_kernel(unsigned long - void flush_tlb_multi(const struct cpumask *cpumask, - const struct flush_tlb_info *info); - -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+static inline bool is_dyn_asid(u16 asid) -+{ -+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ return true; -+ -+ return asid < TLB_NR_DYN_ASIDS; -+} -+ -+static inline bool is_global_asid(u16 asid) -+{ -+ return !is_dyn_asid(asid); -+} -+ -+static inline bool in_asid_transition(struct mm_struct *mm) -+{ -+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ return false; -+ -+ return mm && READ_ONCE(mm->context.asid_transition); -+} -+ -+static inline u16 mm_global_asid(struct mm_struct *mm) -+{ -+ u16 asid; -+ -+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ return 0; -+ -+ asid = smp_load_acquire(&mm->context.global_asid); -+ -+ /* mm->context.global_asid is either 0, or a global ASID */ -+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); -+ -+ return asid; -+} -+#else -+static inline bool is_dyn_asid(u16 asid) -+{ -+ return true; -+} -+ -+static inline bool is_global_asid(u16 asid) -+{ -+ return false; -+} -+ -+static inline bool in_asid_transition(struct mm_struct *mm) -+{ -+ return false; -+} -+ -+static inline u16 mm_global_asid(struct mm_struct *mm) -+{ -+ return 0; -+} -+ -+static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid) -+{ -+ return false; -+} -+ -+static inline void broadcast_tlb_flush(struct flush_tlb_info *info) -+{ -+ VM_WARN_ON_ONCE(1); -+} -+ -+static inline void consider_global_asid(struct mm_struct *mm) -+{ -+} -+#endif -+ - #ifdef CONFIG_PARAVIRT - #include <asm/paravirt.h> - #endif ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -74,13 +74,15 @@ - * use different names for each of them: - * - * ASID - [0, TLB_NR_DYN_ASIDS-1] -- * the canonical identifier for an mm -+ * the canonical identifier for an mm, dynamically allocated on each CPU -+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] -+ * the canonical, global identifier for an mm, identical across all CPUs - * -- * kPCID - [1, TLB_NR_DYN_ASIDS] -+ * kPCID - [1, MAX_ASID_AVAILABLE] - * the value we write into the PCID part of CR3; corresponds to the - * ASID+1, because PCID 0 is special. - * -- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] -+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] - * for KPTI each mm has two address spaces and thus needs two - * PCID values, but we can still do with a single ASID denomination - * for each mm. Corresponds to kPCID + 2048. 
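The numbering above boils down to two tiny mappings; a sketch with made-up names (the kernel's kern_pcid()/user_pcid() also handle the PTI bit placement):

    static inline unsigned int sketch_kpcid(unsigned int asid)
    {
        return asid + 1;            /* PCID 0 is special */
    }

    static inline unsigned int sketch_upcid(unsigned int asid)
    {
        return (asid + 1) + 2048;   /* kPCID + 2048 for the user half */
    }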
-@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st - return; - } - -+ /* -+ * TLB consistency for global ASIDs is maintained with broadcast TLB -+ * flushing. The TLB is never outdated, and does not need flushing. -+ */ -+ if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) { -+ u16 global_asid = mm_global_asid(next); -+ -+ if (global_asid) { -+ *new_asid = global_asid; -+ *need_flush = false; -+ return; -+ } -+ } -+ - if (this_cpu_read(cpu_tlbstate.invalidate_other)) - clear_asid_other(); - -@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st - *need_flush = true; - } - -+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH -+/* -+ * Logic for broadcast TLB invalidation. -+ */ -+static DEFINE_RAW_SPINLOCK(global_asid_lock); -+static u16 last_global_asid = MAX_ASID_AVAILABLE; -+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 }; -+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 }; -+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; -+ -+static void reset_global_asid_space(void) -+{ -+ lockdep_assert_held(&global_asid_lock); -+ -+ /* -+ * A global TLB flush guarantees that any stale entries from -+ * previously freed global ASIDs get flushed from the TLB -+ * everywhere, making these global ASIDs safe to reuse. -+ */ -+ invlpgb_flush_all_nonglobals(); -+ -+ /* -+ * Clear all the previously freed global ASIDs from the -+ * broadcast_asid_used bitmap, now that the global TLB flush -+ * has made them actually available for re-use. -+ */ -+ bitmap_andnot(global_asid_used, global_asid_used, -+ global_asid_freed, MAX_ASID_AVAILABLE); -+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); -+ -+ /* -+ * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID -+ * assignments, for tasks doing IPI based TLB shootdowns. -+ * Restart the search from the start of the global ASID space. -+ */ -+ last_global_asid = TLB_NR_DYN_ASIDS; -+} -+ -+static u16 get_global_asid(void) -+{ -+ -+ u16 asid; -+ -+ lockdep_assert_held(&global_asid_lock); -+ -+ /* The previous allocated ASID is at the top of the address space. */ -+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1) -+ reset_global_asid_space(); -+ -+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); -+ -+ if (asid >= MAX_ASID_AVAILABLE) { -+ /* This should never happen. */ -+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available); -+ return 0; -+ } -+ -+ /* Claim this global ASID. */ -+ __set_bit(asid, global_asid_used); -+ last_global_asid = asid; -+ global_asid_available--; -+ return asid; -+} -+ -+/* -+ * Returns true if the mm is transitioning from a CPU-local ASID to a global -+ * (INVLPGB) ASID, or the other way around. -+ */ -+static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid) -+{ -+ u16 global_asid = mm_global_asid(next); -+ -+ if (global_asid && prev_asid != global_asid) -+ return true; -+ -+ if (!global_asid && is_global_asid(prev_asid)) -+ return true; -+ -+ return false; -+} -+ -+void destroy_context_free_global_asid(struct mm_struct *mm) -+{ -+ if (!mm->context.global_asid) -+ return; -+ -+ guard(raw_spinlock_irqsave)(&global_asid_lock); -+ -+ /* The global ASID can be re-used only after flush at wrap-around. */ -+ __set_bit(mm->context.global_asid, global_asid_freed); -+ -+ mm->context.global_asid = 0; -+ global_asid_available++; -+} -+ -+/* -+ * Check whether a process is currently active on more than "threshold" CPUs. 
-+ * This is a cheap estimation on whether or not it may make sense to assign -+ * a global ASID to this process, and use broadcast TLB invalidation. -+ */ -+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) -+{ -+ int count = 0; -+ int cpu; -+ -+ /* This quick check should eliminate most single threaded programs. */ -+ if (cpumask_weight(mm_cpumask(mm)) <= threshold) -+ return false; -+ -+ /* Slower check to make sure. */ -+ for_each_cpu(cpu, mm_cpumask(mm)) { -+ /* Skip the CPUs that aren't really running this process. */ -+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) -+ continue; -+ -+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) -+ continue; -+ -+ if (++count > threshold) -+ return true; -+ } -+ return false; -+} -+ -+/* -+ * Assign a global ASID to the current process, protecting against -+ * races between multiple threads in the process. -+ */ -+static void use_global_asid(struct mm_struct *mm) -+{ -+ u16 asid; -+ -+ guard(raw_spinlock_irqsave)(&global_asid_lock); -+ -+ /* This process is already using broadcast TLB invalidation. */ -+ if (mm->context.global_asid) -+ return; -+ -+ /* The last global ASID was consumed while waiting for the lock. */ -+ if (!global_asid_available) { -+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); -+ return; -+ } -+ -+ asid = get_global_asid(); -+ if (!asid) -+ return; -+ -+ /* -+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> -+ * finish_asid_transition() needs to observe asid_transition = true -+ * once it observes global_asid. -+ */ -+ mm->context.asid_transition = true; -+ smp_store_release(&mm->context.global_asid, asid); -+} -+ -+static bool meets_global_asid_threshold(struct mm_struct *mm) -+{ -+ if (!global_asid_available) -+ return false; -+ -+ /* -+ * Assign a global ASID if the process is active on -+ * 4 or more CPUs simultaneously. -+ */ -+ return mm_active_cpus_exceeds(mm, 3); -+} -+ -+static void consider_global_asid(struct mm_struct *mm) -+{ -+ if (!static_cpu_has(X86_FEATURE_INVLPGB)) -+ return; -+ -+ /* Check every once in a while. */ -+ if ((current->pid & 0x1f) != (jiffies & 0x1f)) -+ return; -+ -+ if (meets_global_asid_threshold(mm)) -+ use_global_asid(mm); -+} -+ -+static void finish_asid_transition(struct flush_tlb_info *info) -+{ -+ struct mm_struct *mm = info->mm; -+ int bc_asid = mm_global_asid(mm); -+ int cpu; -+ -+ if (!READ_ONCE(mm->context.asid_transition)) -+ return; -+ -+ for_each_cpu(cpu, mm_cpumask(mm)) { -+ /* -+ * The remote CPU is context switching. Wait for that to -+ * finish, to catch the unlikely case of it switching to -+ * the target mm with an out of date ASID. -+ */ -+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) -+ cpu_relax(); -+ -+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) -+ continue; -+ -+ /* -+ * If at least one CPU is not using the global ASID yet, -+ * send a TLB flush IPI. The IPI should cause stragglers -+ * to transition soon. -+ * -+ * This can race with the CPU switching to another task; -+ * that results in a (harmless) extra IPI. -+ */ -+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { -+ flush_tlb_multi(mm_cpumask(info->mm), info); -+ return; -+ } -+ } -+ -+ /* All the CPUs running this process are using the global ASID. 
*/ -+ WRITE_ONCE(mm->context.asid_transition, false); -+} -+ -+static void broadcast_tlb_flush(struct flush_tlb_info *info) -+{ -+ bool pmd = info->stride_shift == PMD_SHIFT; -+ unsigned long maxnr = invlpgb_count_max; -+ unsigned long asid = info->mm->context.global_asid; -+ unsigned long addr = info->start; -+ unsigned long nr; -+ -+ /* Flushing multiple pages at once is not supported with 1GB pages. */ -+ if (info->stride_shift > PMD_SHIFT) -+ maxnr = 1; -+ -+ /* -+ * TLB flushes with INVLPGB are kicked off asynchronously. -+ * The inc_mm_tlb_gen() guarantees page table updates are done -+ * before these TLB flushes happen. -+ */ -+ if (info->end == TLB_FLUSH_ALL) { -+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); -+ /* Do any CPUs supporting INVLPGB need PTI? */ -+ if (static_cpu_has(X86_FEATURE_PTI)) -+ invlpgb_flush_single_pcid_nosync(user_pcid(asid)); -+ } else do { -+ /* -+ * Calculate how many pages can be flushed at once; if the -+ * remainder of the range is less than one page, flush one. -+ */ -+ nr = min(maxnr, (info->end - addr) >> info->stride_shift); -+ nr = max(nr, 1); -+ -+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); -+ /* Do any CPUs supporting INVLPGB need PTI? */ -+ if (static_cpu_has(X86_FEATURE_PTI)) -+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); -+ -+ addr += nr << info->stride_shift; -+ } while (addr < info->end); -+ -+ finish_asid_transition(info); -+ -+ /* Wait for the INVLPGBs kicked off above to finish. */ -+ tlbsync(); -+} -+#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */ -+ - /* - * Given an ASID, flush the corresponding user ASID. We can delay this - * until the next time we switch to it. -@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct - */ - if (prev == next) { - /* Not actually switching mm's */ -- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != -- next->context.ctx_id); -+ VM_WARN_ON(is_dyn_asid(prev_asid) && -+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != -+ next->context.ctx_id); - - /* - * If this races with another thread that enables lam, 'new_lam' -@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* -+ * Check if the current mm is transitioning to a new ASID. -+ */ -+ if (needs_global_asid_reload(next, prev_asid)) { -+ next_tlb_gen = atomic64_read(&next->context.tlb_gen); -+ -+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); -+ goto reload_tlb; -+ } -+ -+ /* -+ * Broadcast TLB invalidation keeps this PCID up to date -+ * all the time. -+ */ -+ if (is_global_asid(prev_asid)) -+ return; -+ -+ /* - * If the CPU is not in lazy TLB mode, we are just switching - * from one thread in a process to another thread in the same - * process. No TLB flush required. -@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct - cond_mitigation(tsk); - - /* -+ * Let nmi_uaccess_okay() and finish_asid_transition() -+ * know that we're changing CR3. -+ */ -+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); -+ barrier(); -+ -+ /* - * Stop remote flushes for the previous mm. - * Skip kernel threads; we never send init_mm TLB flushing IPIs, - * but the bitmap manipulation can cause cache line contention. -@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - - choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); -- -- /* Let nmi_uaccess_okay() know that we're changing CR3. 
*/ -- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); -- barrier(); - } - -+reload_tlb: - new_lam = mm_lam_cr3_mask(next); - if (need_flush) { -+ VM_WARN_ON_ONCE(is_global_asid(new_asid)); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, new_lam, true); -@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info) - const struct flush_tlb_info *f = info; - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); - u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); -- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); -+ u64 local_tlb_gen; - bool local = smp_processor_id() == f->initiating_cpu; - unsigned long nr_invalidate = 0; - u64 mm_tlb_gen; -@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info) - if (unlikely(loaded_mm == &init_mm)) - return; - -+ /* Reload the ASID if transitioning into or out of a global ASID */ -+ if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) { -+ switch_mm_irqs_off(NULL, loaded_mm, NULL); -+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); -+ } -+ -+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */ -+ if (is_global_asid(loaded_mm_asid)) -+ return; -+ - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != - loaded_mm->context.ctx_id); - -@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info) - return; - } - -+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); -+ - if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && - f->new_tlb_gen <= local_tlb_gen)) { - /* -@@ -953,7 +1270,7 @@ STATIC_NOPV void native_flush_tlb_multi( - * up on the new contents of what used to be page tables, while - * doing a speculative memory access. - */ -- if (info->freed_tables) -+ if (info->freed_tables || in_asid_transition(info->mm)) - on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); - else - on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, -@@ -1058,9 +1375,12 @@ void flush_tlb_mm_range(struct mm_struct - * a local TLB flush is needed. Optimize this use-case by calling - * flush_tlb_func_local() directly in this case. - */ -- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { -+ if (mm_global_asid(mm)) { -+ broadcast_tlb_flush(info); -+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { - info->trim_cpumask = should_trim_cpumask(mm); - flush_tlb_multi(mm_cpumask(mm), info); -+ consider_global_asid(mm); - } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { - lockdep_assert_irqs_enabled(); - local_irq_disable(); diff --git a/debian/patches/patchset-zen/invlpgb-v9/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch b/debian/patches/patchset-zen/invlpgb-v9/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch deleted file mode 100644 index 4f12a20..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch +++ /dev/null @@ -1,251 +0,0 @@ -From 6f601cdcd33be8fc0da98c6bab777575af3260b8 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:29 -0500 -Subject: x86/mm: do targeted broadcast flushing from tlbbatch code - -Instead of doing a system-wide TLB flush from arch_tlbbatch_flush, -queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending. 
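The intended flow, sketched with the helper names from this patch (illustration only; assumes the PCID argument has already been translated from the ASID):

    static void batch_flush_sketch(unsigned long pcid,
                                   unsigned long *addrs, int nr)
    {
        int i;

        /* queue one asynchronous broadcast invalidation per page */
        for (i = 0; i < nr; i++)
            invlpgb_flush_user_nr_nosync(pcid, addrs[i], 1, false);

        /* one TLBSYNC waits for everything queued above */
        tlbsync();
    }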
- -This also allows us to avoid adding the CPUs of processes using broadcast -flushing to the batch->cpumask, and will hopefully further reduce TLB -flushing from the reclaim and compaction paths. - -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/include/asm/invlpgb.h | 21 +++++---- - arch/x86/include/asm/tlbflush.h | 17 ++++--- - arch/x86/mm/tlb.c | 80 +++++++++++++++++++++++++++++++-- - 3 files changed, 95 insertions(+), 23 deletions(-) - ---- a/arch/x86/include/asm/invlpgb.h -+++ b/arch/x86/include/asm/invlpgb.h -@@ -31,9 +31,8 @@ static inline void __invlpgb(unsigned lo - } - - /* Wait for INVLPGB originated by this CPU to complete. */ --static inline void tlbsync(void) -+static inline void __tlbsync(void) - { -- cant_migrate(); - /* TLBSYNC: supported in binutils >= 0.36. */ - asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); - } -@@ -61,19 +60,19 @@ static inline void invlpgb_flush_user(un - unsigned long addr) - { - __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); -- tlbsync(); -+ __tlbsync(); - } - --static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, -- unsigned long addr, -- u16 nr, -- bool pmd_stride) -+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid, -+ unsigned long addr, -+ u16 nr, -+ bool pmd_stride) - { - __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); - } - - /* Flush all mappings for a given PCID, not including globals. */ --static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) -+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid) - { - __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); - } -@@ -82,11 +81,11 @@ static inline void invlpgb_flush_single_ - static inline void invlpgb_flush_all(void) - { - __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); -- tlbsync(); -+ __tlbsync(); - } - - /* Flush addr, including globals, for all PCIDs. */ --static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) -+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) - { - __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); - } -@@ -95,7 +94,7 @@ static inline void invlpgb_flush_addr_no - static inline void invlpgb_flush_all_nonglobals(void) - { - __invlpgb(0, 0, 0, 0, 0, 0); -- tlbsync(); -+ __tlbsync(); - } - - #endif /* _ASM_X86_INVLPGB */ ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -106,6 +106,7 @@ struct tlb_state { - * need to be invalidated. 
- */ - bool invalidate_other; -+ bool need_tlbsync; - - #ifdef CONFIG_ADDRESS_MASKING - /* -@@ -310,6 +311,10 @@ static inline void broadcast_tlb_flush(s - static inline void consider_global_asid(struct mm_struct *mm) - { - } -+ -+static inline void tlbsync(void) -+{ -+} - #endif - - #ifdef CONFIG_PARAVIRT -@@ -359,21 +364,15 @@ static inline u64 inc_mm_tlb_gen(struct - return atomic64_inc_return(&mm->context.tlb_gen); - } - --static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, -- struct mm_struct *mm, -- unsigned long uaddr) --{ -- inc_mm_tlb_gen(mm); -- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); -- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); --} -- - static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) - { - flush_tlb_mm(mm); - } - - extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); -+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, -+ struct mm_struct *mm, -+ unsigned long uaddr); - - static inline bool pte_flags_need_flush(unsigned long oldflags, - unsigned long newflags, ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -488,6 +488,37 @@ static void finish_asid_transition(struc - WRITE_ONCE(mm->context.asid_transition, false); - } - -+static inline void tlbsync(void) -+{ -+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync)) -+ return; -+ __tlbsync(); -+ this_cpu_write(cpu_tlbstate.need_tlbsync, false); -+} -+ -+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, -+ unsigned long addr, -+ u16 nr, bool pmd_stride) -+{ -+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride); -+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync)) -+ this_cpu_write(cpu_tlbstate.need_tlbsync, true); -+} -+ -+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) -+{ -+ __invlpgb_flush_single_pcid_nosync(pcid); -+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync)) -+ this_cpu_write(cpu_tlbstate.need_tlbsync, true); -+} -+ -+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) -+{ -+ __invlpgb_flush_addr_nosync(addr, nr); -+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync)) -+ this_cpu_write(cpu_tlbstate.need_tlbsync, true); -+} -+ - static void broadcast_tlb_flush(struct flush_tlb_info *info) - { - bool pmd = info->stride_shift == PMD_SHIFT; -@@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(!irqs_disabled()); - -+ tlbsync(); -+ - /* - * Verify that CR3 is what we think it is. This will catch - * hypothetical buggy code that directly switches to swapper_pg_dir -@@ -976,6 +1009,8 @@ reload_tlb: - */ - void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) - { -+ tlbsync(); -+ - if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) - return; - -@@ -1650,9 +1685,7 @@ void arch_tlbbatch_flush(struct arch_tlb - * a local TLB flush is needed. Optimize this use-case by calling - * flush_tlb_func_local() directly in this case. - */ -- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { -- invlpgb_flush_all_nonglobals(); -- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { -+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { - flush_tlb_multi(&batch->cpumask, info); - } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { - lockdep_assert_irqs_enabled(); -@@ -1661,12 +1694,53 @@ void arch_tlbbatch_flush(struct arch_tlb - local_irq_enable(); - } - -+ /* -+ * If we issued (asynchronous) INVLPGB flushes, wait for them here. 
-+ * The cpumask above contains only CPUs that were running tasks -+ * not using broadcast TLB flushing. -+ */ -+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) -+ tlbsync(); -+ - cpumask_clear(&batch->cpumask); - - put_flush_tlb_info(); - put_cpu(); - } - -+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, -+ struct mm_struct *mm, -+ unsigned long uaddr) -+{ -+ u16 asid = mm_global_asid(mm); -+ -+ if (asid) { -+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false); -+ /* Do any CPUs supporting INVLPGB need PTI? */ -+ if (static_cpu_has(X86_FEATURE_PTI)) -+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false); -+ -+ /* -+ * Some CPUs might still be using a local ASID for this -+ * process, and require IPIs, while others are using the -+ * global ASID. -+ * -+ * In this corner case we need to do both the broadcast -+ * TLB invalidation, and send IPIs. The IPIs will help -+ * stragglers transition to the broadcast ASID. -+ */ -+ if (in_asid_transition(mm)) -+ asid = 0; -+ } -+ -+ if (!asid) { -+ inc_mm_tlb_gen(mm); -+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); -+ } -+ -+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); -+} -+ - /* - * Blindly accessing user memory from NMI context can be dangerous - * if we're in the middle of switching the current user task or diff --git a/debian/patches/patchset-zen/invlpgb-v9/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch b/debian/patches/patchset-zen/invlpgb-v9/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch deleted file mode 100644 index 5676a23..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 7b8ef03b059bca98d2af696c3ec2adcaa673f7e4 Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:31 -0500 -Subject: x86/mm: only invalidate final translations with INVLPGB - -Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB. -This way only leaf mappings get removed from the TLB, leaving intermediate -translations cached. - -On the (rare) occasions where we free page tables we do a full flush, -ensuring intermediate translations get flushed from the TLB. - -Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> ---- - arch/x86/include/asm/invlpgb.h | 10 ++++++++-- - arch/x86/mm/tlb.c | 13 +++++++------ - 2 files changed, 15 insertions(+), 8 deletions(-) - ---- a/arch/x86/include/asm/invlpgb.h -+++ b/arch/x86/include/asm/invlpgb.h -@@ -66,9 +66,15 @@ static inline void invlpgb_flush_user(un - static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid, - unsigned long addr, - u16 nr, -- bool pmd_stride) -+ bool pmd_stride, -+ bool freed_tables) - { -- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); -+ u8 flags = INVLPGB_PCID | INVLPGB_VA; -+ -+ if (!freed_tables) -+ flags |= INVLPGB_FINAL_ONLY; -+ -+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags); - } - - /* Flush all mappings for a given PCID, not including globals.
*/ ---- a/arch/x86/mm/tlb.c -+++ b/arch/x86/mm/tlb.c -@@ -498,9 +498,10 @@ static inline void tlbsync(void) - - static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, - unsigned long addr, -- u16 nr, bool pmd_stride) -+ u16 nr, bool pmd_stride, -+ bool freed_tables) - { -- __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride); -+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride, freed_tables); - if (!this_cpu_read(cpu_tlbstate.need_tlbsync)) - this_cpu_write(cpu_tlbstate.need_tlbsync, true); - } -@@ -549,10 +550,10 @@ static void broadcast_tlb_flush(struct f - nr = min(maxnr, (info->end - addr) >> info->stride_shift); - nr = max(nr, 1); - -- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); -+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables); - /* Do any CPUs supporting INVLPGB need PTI? */ - if (static_cpu_has(X86_FEATURE_PTI)) -- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); -+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables); - - addr += nr << info->stride_shift; - } while (addr < info->end); -@@ -1715,10 +1716,10 @@ void arch_tlbbatch_add_pending(struct ar - u16 asid = mm_global_asid(mm); - - if (asid) { -- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false); -+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false); - /* Do any CPUs supporting INVLPGB need PTI? */ - if (static_cpu_has(X86_FEATURE_PTI)) -- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false); -+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false); - - /* - * Some CPUs might still be using a local ASID for this diff --git a/debian/patches/patchset-zen/invlpgb-v9/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch b/debian/patches/patchset-zen/invlpgb-v9/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch deleted file mode 100644 index 7b169f4..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 7b0836fcad644d24d6318bf63013ec1b35d6a27b Mon Sep 17 00:00:00 2001 -From: Rik van Riel <riel@surriel.com> -Date: Thu, 19 Dec 2024 15:32:53 -0500 -Subject: mm: remove unnecessary calls to lru_add_drain - -There seem to be several categories of calls to lru_add_drain -and lru_add_drain_all. - -The first are code paths that recently allocated, swapped in, -or otherwise processed a batch of pages, and want them all on -the LRU. These drain pages that were recently allocated, -probably on the local CPU. - -A second category are code paths that are actively trying to -reclaim, migrate, or offline memory. These often use lru_add_drain_all, -to drain the caches on all CPUs. - -However, there also seem to be some other callers where we -aren't really doing either. They are calling lru_add_drain(), -despite operating on pages that may have been allocated -long ago, and quite possibly on different CPUs. - -Those calls are not likely to be effective at anything but -creating lock contention on the LRU locks. - -Remove the lru_add_drain calls in the latter category. 
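The distinction between the categories, sketched for contrast (illustration only):

    /* Category 1 (kept): this CPU just created or swapped in the pages. */
    lru_add_drain();        /* cheap: flushes only this CPU's folio batches */

    /* Category 2 (kept): reclaim/migration/offlining wants every page on
     * the LRU, wherever it was batched. */
    lru_add_drain_all();    /* expensive: schedules drain work on all CPUs */

    /* Removed category: tearing down mappings of long-lived pages that sit
     * in no local batch - draining here mostly adds LRU lock contention. */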
- -Signed-off-by: Rik van Riel <riel@surriel.com> -Suggested-by: David Hildenbrand <david@redhat.com> -Acked-by: Shakeel Butt <shakeel.butt@linux.dev> -Acked-by: David Hildenbrand <david@redhat.com> ---- - mm/memory.c | 1 - - mm/mmap.c | 2 -- - mm/swap_state.c | 1 - - mm/vma.c | 2 -- - 4 files changed, 6 deletions(-) - ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are - struct mmu_notifier_range range; - struct mmu_gather tlb; - -- lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, - address, end); - hugetlb_zap_begin(vma, &range.start, &range.end); ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm) - goto destroy; - } - -- lru_add_drain(); - flush_cache_mm(mm); - tlb_gather_mmu_fullmm(&tlb, mm); - /* update_hiwater_rss(mm) here? but nobody should be looking */ -@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str - vma, new_start, length, false, true)) - return -ENOMEM; - -- lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - next = vma_next(&vmi); - if (new_end > old_start) { ---- a/mm/swap_state.c -+++ b/mm/swap_state.c -@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en - struct folio_batch folios; - unsigned int refs[PAGEVEC_SIZE]; - -- lru_add_drain(); - folio_batch_init(&folios); - for (int i = 0; i < nr; i++) { - struct folio *folio = page_folio(encoded_page_ptr(pages[i])); ---- a/mm/vma.c -+++ b/mm/vma.c -@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas, - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather tlb; - -- lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, -@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct - * were isolated before we downgraded mmap_lock. - */ - mas_set(mas_detach, 1); -- lru_add_drain(); - tlb_gather_mmu(&tlb, vms->vma->vm_mm); - update_hiwater_rss(vms->vma->vm_mm); - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, diff --git a/debian/patches/patchset-zen/invlpgb-v9/0014-vdso-Introduce-vdso-page.h.patch b/debian/patches/patchset-zen/invlpgb-v9/0014-vdso-Introduce-vdso-page.h.patch deleted file mode 100644 index 0823215..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0014-vdso-Introduce-vdso-page.h.patch +++ /dev/null @@ -1,429 +0,0 @@ -From 7ecab5a83d3155baa009cd6bc6e18959fee8be62 Mon Sep 17 00:00:00 2001 -From: Vincenzo Frascino <vincenzo.frascino@arm.com> -Date: Mon, 14 Oct 2024 16:13:39 +0100 -Subject: vdso: Introduce vdso/page.h - -The VDSO implementation includes headers from outside of the -vdso/ namespace. - -Introduce vdso/page.h to make sure that the generic library -uses only the allowed namespace. 
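A minimal sketch of a consumer after this change (vdso_align_up() is a hypothetical helper, not from the patch):

    #include <vdso/page.h>          /* instead of reaching into asm/page.h */

    static inline unsigned long vdso_align_up(unsigned long addr)
    {
        return (addr + PAGE_SIZE - 1) & PAGE_MASK;
    }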
- -Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com> -Signed-off-by: Thomas Gleixner <tglx@linutronix.de> -Reviewed-by: Arnd Bergmann <arnd@arndb.de> -Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k -Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com ---- - arch/alpha/include/asm/page.h | 6 +----- - arch/arc/include/uapi/asm/page.h | 7 +++---- - arch/arm/include/asm/page.h | 5 +---- - arch/arm64/include/asm/page-def.h | 5 +---- - arch/csky/include/asm/page.h | 8 ++------ - arch/hexagon/include/asm/page.h | 4 +--- - arch/loongarch/include/asm/page.h | 7 +------ - arch/m68k/include/asm/page.h | 6 ++---- - arch/microblaze/include/asm/page.h | 5 +---- - arch/mips/include/asm/page.h | 7 +------ - arch/nios2/include/asm/page.h | 7 +------ - arch/openrisc/include/asm/page.h | 11 +---------- - arch/parisc/include/asm/page.h | 4 +--- - arch/powerpc/include/asm/page.h | 10 +--------- - arch/riscv/include/asm/page.h | 4 +--- - arch/s390/include/asm/page.h | 13 +++++-------- - arch/sh/include/asm/page.h | 6 ++---- - arch/sparc/include/asm/page_32.h | 4 +--- - arch/sparc/include/asm/page_64.h | 4 +--- - arch/um/include/asm/page.h | 5 +---- - arch/x86/include/asm/page_types.h | 5 +---- - arch/xtensa/include/asm/page.h | 8 +------- - include/vdso/page.h | 30 ++++++++++++++++++++++++++++++ - 23 files changed, 61 insertions(+), 110 deletions(-) - create mode 100644 include/vdso/page.h - ---- a/arch/alpha/include/asm/page.h -+++ b/arch/alpha/include/asm/page.h -@@ -4,11 +4,7 @@ - - #include <linux/const.h> - #include <asm/pal.h> -- --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #ifndef __ASSEMBLY__ - ---- a/arch/arc/include/uapi/asm/page.h -+++ b/arch/arc/include/uapi/asm/page.h -@@ -14,7 +14,7 @@ - - /* PAGE_SHIFT determines the page size */ - #ifdef __KERNEL__ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT -+#include <vdso/page.h> - #else - /* - * Default 8k -@@ -24,11 +24,10 @@ - * not available - */ - #define PAGE_SHIFT 13 -+#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */ -+#define PAGE_MASK (~(PAGE_SIZE-1)) - #endif - --#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */ - #define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwrds */ - --#define PAGE_MASK (~(PAGE_SIZE-1)) -- - #endif /* _UAPI__ASM_ARC_PAGE_H */ ---- a/arch/arm/include/asm/page.h -+++ b/arch/arm/include/asm/page.h -@@ -7,10 +7,7 @@ - #ifndef _ASMARM_PAGE_H - #define _ASMARM_PAGE_H - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) -+#include <vdso/page.h> - - #ifndef __ASSEMBLY__ - ---- a/arch/arm64/include/asm/page-def.h -+++ b/arch/arm64/include/asm/page-def.h -@@ -10,9 +10,6 @@ - - #include <linux/const.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #endif /* __ASM_PAGE_DEF_H */ ---- a/arch/csky/include/asm/page.h -+++ b/arch/csky/include/asm/page.h -@@ -7,12 +7,8 @@ - #include <asm/cache.h> - #include <linux/const.h> - --/* -- * PAGE_SHIFT determines the page size: 4KB -- */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE - 1)) -+#include <vdso/page.h> -+ - #define THREAD_SIZE 
(PAGE_SIZE * 2) - #define THREAD_MASK (~(THREAD_SIZE - 1)) - #define THREAD_SHIFT (PAGE_SHIFT + 1) ---- a/arch/hexagon/include/asm/page.h -+++ b/arch/hexagon/include/asm/page.h -@@ -45,9 +45,7 @@ - #define HVM_HUGEPAGE_SIZE 0x5 - #endif - --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (1UL << PAGE_SHIFT) --#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) -+#include <vdso/page.h> - - #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ ---- a/arch/loongarch/include/asm/page.h -+++ b/arch/loongarch/include/asm/page.h -@@ -8,12 +8,7 @@ - #include <linux/const.h> - #include <asm/addrspace.h> - --/* -- * PAGE_SHIFT determines the page size -- */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE - 1)) -+#include <vdso/page.h> - - #define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) - #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) ---- a/arch/m68k/include/asm/page.h -+++ b/arch/m68k/include/asm/page.h -@@ -6,10 +6,8 @@ - #include <asm/setup.h> - #include <asm/page_offset.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> -+ - #define PAGE_OFFSET (PAGE_OFFSET_RAW) - - #ifndef __ASSEMBLY__ ---- a/arch/microblaze/include/asm/page.h -+++ b/arch/microblaze/include/asm/page.h -@@ -19,10 +19,7 @@ - - #ifdef __KERNEL__ - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR)) - ---- a/arch/mips/include/asm/page.h -+++ b/arch/mips/include/asm/page.h -@@ -14,12 +14,7 @@ - #include <linux/kernel.h> - #include <asm/mipsregs.h> - --/* -- * PAGE_SHIFT determines the page size -- */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) -+#include <vdso/page.h> - - /* - * This is used for calculating the real page sizes ---- a/arch/nios2/include/asm/page.h -+++ b/arch/nios2/include/asm/page.h -@@ -18,12 +18,7 @@ - #include <linux/pfn.h> - #include <linux/const.h> - --/* -- * PAGE_SHIFT determines the page size -- */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE - 1)) -+#include <vdso/page.h> - - /* - * PAGE_OFFSET -- the first address of the first page of memory. ---- a/arch/openrisc/include/asm/page.h -+++ b/arch/openrisc/include/asm/page.h -@@ -15,16 +15,7 @@ - #ifndef __ASM_OPENRISC_PAGE_H - #define __ASM_OPENRISC_PAGE_H - -- --/* PAGE_SHIFT determines the page size */ -- --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#ifdef __ASSEMBLY__ --#define PAGE_SIZE (1 << PAGE_SHIFT) --#else --#define PAGE_SIZE (1UL << PAGE_SHIFT) --#endif --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #define PAGE_OFFSET 0xc0000000 - #define KERNELBASE PAGE_OFFSET ---- a/arch/parisc/include/asm/page.h -+++ b/arch/parisc/include/asm/page.h -@@ -4,9 +4,7 @@ - - #include <linux/const.h> - --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA - ---- a/arch/powerpc/include/asm/page.h -+++ b/arch/powerpc/include/asm/page.h -@@ -21,8 +21,7 @@ - * page size. 
When using 64K pages however, whether we are really supporting - * 64K pages in HW or not is irrelevant to those definitions. - */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) -+#include <vdso/page.h> - - #ifndef __ASSEMBLY__ - #ifndef CONFIG_HUGETLB_PAGE -@@ -42,13 +41,6 @@ extern unsigned int hpage_shift; - #endif - - /* -- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we -- * assign PAGE_MASK to a larger type it gets extended the way we want -- * (i.e. with 1s in the high bits) -- */ --#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) -- --/* - * KERNELBASE is the virtual address of the start of the kernel, it's often - * the same as PAGE_OFFSET, but _might not be_. - * ---- a/arch/riscv/include/asm/page.h -+++ b/arch/riscv/include/asm/page.h -@@ -12,9 +12,7 @@ - #include <linux/pfn.h> - #include <linux/const.h> - --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE - 1)) -+#include <vdso/page.h> - - #define HPAGE_SHIFT PMD_SHIFT - #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) ---- a/arch/s390/include/asm/page.h -+++ b/arch/s390/include/asm/page.h -@@ -11,14 +11,11 @@ - #include <linux/const.h> - #include <asm/types.h> - --#define _PAGE_SHIFT CONFIG_PAGE_SHIFT --#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) --#define _PAGE_MASK (~(_PAGE_SIZE - 1)) -+#include <vdso/page.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT _PAGE_SHIFT --#define PAGE_SIZE _PAGE_SIZE --#define PAGE_MASK _PAGE_MASK -+#define _PAGE_SHIFT PAGE_SHIFT -+#define _PAGE_SIZE PAGE_SIZE -+#define _PAGE_MASK PAGE_MASK - #define PAGE_DEFAULT_ACC _AC(0, UL) - /* storage-protection override */ - #define PAGE_SPO_ACC 9 ---- a/arch/sh/include/asm/page.h -+++ b/arch/sh/include/asm/page.h -@@ -8,10 +8,8 @@ - - #include <linux/const.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> -+ - #define PTE_MASK PAGE_MASK - - #if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) ---- a/arch/sparc/include/asm/page_32.h -+++ b/arch/sparc/include/asm/page_32.h -@@ -11,9 +11,7 @@ - - #include <linux/const.h> - --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #ifndef __ASSEMBLY__ - ---- a/arch/sparc/include/asm/page_64.h -+++ b/arch/sparc/include/asm/page_64.h -@@ -4,9 +4,7 @@ - - #include <linux/const.h> - --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - /* Flushing for D-cache alias handling is only needed if - * the page size is smaller than 16K. 
---- a/arch/um/include/asm/page.h -+++ b/arch/um/include/asm/page.h -@@ -9,10 +9,7 @@ - - #include <linux/const.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #ifndef __ASSEMBLY__ - ---- a/arch/x86/include/asm/page_types.h -+++ b/arch/x86/include/asm/page_types.h -@@ -6,10 +6,7 @@ - #include <linux/types.h> - #include <linux/mem_encrypt.h> - --/* PAGE_SHIFT determines the page size */ --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) - ---- a/arch/xtensa/include/asm/page.h -+++ b/arch/xtensa/include/asm/page.h -@@ -18,13 +18,7 @@ - #include <asm/cache.h> - #include <asm/kmem_layout.h> - --/* -- * PAGE_SHIFT determines the page size -- */ -- --#define PAGE_SHIFT CONFIG_PAGE_SHIFT --#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT) --#define PAGE_MASK (~(PAGE_SIZE-1)) -+#include <vdso/page.h> - - #ifdef CONFIG_MMU - #define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR ---- /dev/null -+++ b/include/vdso/page.h -@@ -0,0 +1,30 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef __VDSO_PAGE_H -+#define __VDSO_PAGE_H -+ -+#include <uapi/linux/const.h> -+ -+/* -+ * PAGE_SHIFT determines the page size. -+ * -+ * Note: This definition is required because PAGE_SHIFT is used -+ * in several places throughout the codebase. -+ */ -+#define PAGE_SHIFT CONFIG_PAGE_SHIFT -+ -+#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT) -+ -+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT) -+/* -+ * Applies only to 32-bit architectures with a 64-bit phys_addr_t. -+ * -+ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long. -+ * So if we assign PAGE_MASK to a larger type it gets extended the -+ * way we want (i.e. with 1s in the high bits) -+ */ -+#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1)) -+#else -+#define PAGE_MASK (~(PAGE_SIZE - 1)) -+#endif -+ -+#endif /* __VDSO_PAGE_H */ diff --git a/debian/patches/patchset-zen/invlpgb-v9/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch b/debian/patches/patchset-zen/invlpgb-v9/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch deleted file mode 100644 index f37d8df..0000000 --- a/debian/patches/patchset-zen/invlpgb-v9/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch +++ /dev/null @@ -1,68 +0,0 @@ -From d1bcf51400e790e65945a29078bd816bd61aa148 Mon Sep 17 00:00:00 2001 -From: Arnd Bergmann <arnd@arndb.de> -Date: Thu, 24 Oct 2024 13:34:26 +0000 -Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures - -With the introduction of an architecture-independent definition of -PAGE_MASK, we had to make a choice between defining it as 'unsigned long' -as on 64-bit architectures, or as signed 'long' as required for -architectures with a 64-bit phys_addr_t. - -To reduce the risk for regressions and minimize the changes in behavior, -the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT -is set, but that ended up causing a regression after all in the -early_init_dt_add_memory_arch() function that uses 64-bit integers for -address calculation. - -Presumably the same regression also affects mips32 and powerpc32 when -dealing with large amounts of memory on DT platforms: like arm32, they were -using the signed version unconditionally.
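The underlying C subtlety can be demonstrated in a few lines of user-space code (sketch; assumes PAGE_SHIFT == 12, a 32-bit unsigned long, and a 64-bit physical address):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t phys = 0x1234567890ULL;                /* above 4 GiB */
        uint32_t mask_unsigned = ~((1u << 12) - 1);     /* 32-bit unsigned long */
        int32_t mask_signed = ~((1 << 12) - 1);         /* plain int */

        /* zero-extends: high physical bits wrongly cleared -> 0x34567000 */
        printf("%#llx\n", (unsigned long long)(phys & mask_unsigned));
        /* sign-extends: high bits preserved -> 0x1234567000 */
        printf("%#llx\n", (unsigned long long)(phys & mask_signed));
        return 0;
    }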
- -The two most sensible options for addressing the regression are either to -go back to an architecture specific definition, using a signed constant on -arm/powerpc/mips and unsigned on the others, or to use the same definition -everywhere. - -Use the simpler of those two and change them all to the signed version, in -the hope that this does not cause a different type of bug. Most of the -other 32-bit architectures have no large physical address support and are -rarely used, so it seems more likely that using the same definition helps -than hurts here. - -In particular, x86-32 does have physical addressing extensions, so it -already changed to the signed version after the previous patch, so it makes -sense to use the same version on non-PAE as well. - -Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h") -Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> -Signed-off-by: Arnd Bergmann <arnd@arndb.de> -Signed-off-by: Thomas Gleixner <tglx@linutronix.de> -Tested-by: Anders Roxell <anders.roxell@linaro.org> -Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com> -Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com> -Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org -Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/ ---- - include/vdso/page.h | 7 ++++--- - 1 file changed, 4 insertions(+), 3 deletions(-) - ---- a/include/vdso/page.h -+++ b/include/vdso/page.h -@@ -14,13 +14,14 @@ - - #define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT) - --#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT) -+#if !defined(CONFIG_64BIT) - /* -- * Applies only to 32-bit architectures with a 64-bit phys_addr_t. -+ * Applies only to 32-bit architectures. - * - * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long. - * So if we assign PAGE_MASK to a larger type it gets extended the -- * way we want (i.e. with 1s in the high bits) -+ * way we want (i.e. with 1s in the high bits) while masking a -+ * 64-bit value such as phys_addr_t. - */ - #define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1)) - #else diff --git a/debian/patches/patchset-zen/invlpgb-v9/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch b/debian/patches/patchset-zen/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch similarity index 56% rename from debian/patches/patchset-zen/invlpgb-v9/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch rename to debian/patches/patchset-zen/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch index 8c3a174..46cff6a 100644 --- a/debian/patches/patchset-zen/invlpgb-v9/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch +++ b/debian/patches/patchset-zen/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch @@ -1,7 +1,7 @@ -From e11153c4df0fee7caadec3714a60a4936d6a9ea2 Mon Sep 17 00:00:00 2001 +From 1901291057a3f1bf2bf94c7a4ddf3253d3116acb Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:20 -0500 -Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional +Date: Thu, 13 Feb 2025 11:13:52 -0500 +Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using paravirt, and not when running on bare metal. @@ -11,8 +11,9 @@ each setup. Make them all the same. 
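A sketch of the bare-metal synchronization being unified here (walk_tables_sketch() is hypothetical; real lockless GUP is considerably more involved):

    static int gup_fast_sketch(unsigned long addr)
    {
        unsigned long flags;
        int ret;

        local_irq_save(flags);          /* blocks the TLB flush IPI ... */
        ret = walk_tables_sketch(addr); /* hypothetical page table walk */
        local_irq_restore(flags);       /* ... so tables can't be freed meanwhile */
        return ret;
    }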
Currently get_user_pages_fast synchronizes against page table freeing in two different ways: -- on bare metal, by blocking IRQs, which block TLB flush IPIs -- on paravirt, with MMU_GATHER_RCU_TABLE_FREE + + - on bare metal, by blocking IRQs, which block TLB flush IPIs + - on paravirt, with MMU_GATHER_RCU_TABLE_FREE This is done because some paravirt TLB flush implementations handle the TLB flush in the hypervisor, and will do the flush @@ -27,18 +28,22 @@ as an implicit way to block RCU frees. That makes it safe to use INVLPGB on AMD CPUs. -Signed-off-by: Rik van Riel <riel@surriel.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Ingo Molnar <mingo@kernel.org> Tested-by: Manali Shukla <Manali.Shukla@amd.com> +Tested-by: Brendan Jackman <jackmanb@google.com> +Tested-by: Michael Kelley <mhklinux@outlook.com> +Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com --- arch/x86/Kconfig | 2 +- - arch/x86/kernel/paravirt.c | 7 +------ - arch/x86/mm/pgtable.c | 16 ++++------------ - 3 files changed, 6 insertions(+), 19 deletions(-) + arch/x86/kernel/paravirt.c | 17 +---------------- + arch/x86/mm/pgtable.c | 27 ++++----------------------- + 3 files changed, 6 insertions(+), 40 deletions(-) --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig -@@ -270,7 +270,7 @@ config X86 +@@ -277,7 +277,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP @@ -49,19 +54,29 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com> select HAVE_REGS_AND_STACK_ACCESS_API --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c -@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) +@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } +-#ifndef CONFIG_PT_RECLAIM -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ -- tlb_remove_page(tlb, table); +- struct ptdesc *ptdesc = (struct ptdesc *)table; +- +- pagetable_dtor(ptdesc); +- tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} +-#else +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_table(tlb, table); +-} +-#endif - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops = +@@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops = .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, @@ -72,53 +87,63 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com> .mmu.notify_page_enc_status_changed = paravirt_nop, --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c -@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); +@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif -#ifndef CONFIG_PARAVIRT +-#ifndef CONFIG_PT_RECLAIM -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ -- tlb_remove_page(tlb, table); +- struct ptdesc *ptdesc = (struct ptdesc *)table; +- +- pagetable_dtor(ptdesc); +- tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} --#endif +-#else +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_table(tlb, table); +-} +-#endif /* !CONFIG_PT_RECLAIM */ +-#endif /* !CONFIG_PARAVIRT */ - gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) -@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct 
mmu_gather * +@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); -- paravirt_tlb_remove_table(tlb, pte); -+ tlb_remove_table(tlb, pte); +- paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); ++ tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 -@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather * +@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather * + #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pagetable_pmd_dtor(ptdesc); -- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); -+ tlb_remove_table(tlb, ptdesc_page(ptdesc)); +- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); ++ tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 -@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather * - - pagetable_pud_dtor(ptdesc); + void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) + { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); -- paravirt_tlb_remove_table(tlb, virt_to_page(pud)); -+ tlb_remove_table(tlb, virt_to_page(pud)); +- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); ++ tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); -- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); -+ tlb_remove_table(tlb, virt_to_page(p4d)); +- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); ++ tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/debian/patches/patchset-zen/invlpgb-v9/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch b/debian/patches/patchset-zen/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch similarity index 84% rename from debian/patches/patchset-zen/invlpgb-v9/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch rename to debian/patches/patchset-zen/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch index 6d0760f..460ee01 100644 --- a/debian/patches/patchset-zen/invlpgb-v9/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch +++ b/debian/patches/patchset-zen/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch @@ -1,16 +1,20 @@ -From e8008cb69c5e4efbaedd70b0fb692343e4aa0e51 Mon Sep 17 00:00:00 2001 +From 002a3e971d0d7987bdcdd564eccfa3dd63637226 Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:21 -0500 -Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call +Date: Thu, 13 Feb 2025 11:13:53 -0500 +Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. Get rid of the indirection by simply calling tlb_remove_table directly, and not going through the paravirt function pointers. 
-Signed-off-by: Rik van Riel <riel@surriel.com> Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com> +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Ingo Molnar <mingo@kernel.org> Tested-by: Manali Shukla <Manali.Shukla@amd.com> +Tested-by: Brendan Jackman <jackmanb@google.com> +Tested-by: Michael Kelley <mhklinux@outlook.com> +Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com --- arch/x86/hyperv/mmu.c | 1 - arch/x86/include/asm/paravirt.h | 5 ----- @@ -22,7 +26,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com> --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c -@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) +@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; @@ -44,7 +48,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com> PVOP_VCALL1(mmu.exit_mmap, mm); --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h -@@ -136,8 +136,6 @@ struct pv_mmu_ops { +@@ -134,8 +134,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); @@ -65,7 +69,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com> --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c -@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops = +@@ -180,7 +180,6 @@ struct paravirt_patch_template pv_ops = .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, diff --git a/debian/patches/patchset-zen/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch b/debian/patches/patchset-zen/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch new file mode 100644 index 0000000..2ee5563 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch @@ -0,0 +1,87 @@ +From d4784e28cc2e488fce80ded0ff086c50244593ca Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:36 -0500 +Subject: x86/mm: Consolidate full flush threshold decision + +Reduce code duplication by consolidating the decision point for whether to do +individual invalidations or a full flush inside get_flush_tlb_info(). + +Suggested-by: Dave Hansen <dave.hansen@intel.com> +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com +--- + arch/x86/mm/tlb.c | 41 +++++++++++++++++++---------------------- + 1 file changed, 19 insertions(+), 22 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1000,6 +1000,15 @@ static struct flush_tlb_info *get_flush_ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); + #endif + ++ /* ++ * If the number of flushes is so large that a full flush ++ * would be faster, do a full flush. ++ */ ++ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { ++ start = 0; ++ end = TLB_FLUSH_ALL; ++ } ++ + info->start = start; + info->end = end; + info->mm = mm; +@@ -1026,17 +1035,8 @@ void flush_tlb_mm_range(struct mm_struct + bool freed_tables) + { + struct flush_tlb_info *info; ++ int cpu = get_cpu(); + u64 new_tlb_gen; +- int cpu; +- +- cpu = get_cpu(); +- +- /* Should we flush just the requested range? 
*/ +- if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { +- start = 0; +- end = TLB_FLUSH_ALL; +- } + + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); +@@ -1089,22 +1089,19 @@ static void do_kernel_range_flush(void * + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) + { +- /* Balance as user space task's flush, a bit conservative */ +- if (end == TLB_FLUSH_ALL || +- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { +- on_each_cpu(do_flush_tlb_all, NULL, 1); +- } else { +- struct flush_tlb_info *info; ++ struct flush_tlb_info *info; + +- preempt_disable(); +- info = get_flush_tlb_info(NULL, start, end, 0, false, +- TLB_GENERATION_INVALID); ++ guard(preempt)(); + ++ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, ++ TLB_GENERATION_INVALID); ++ ++ if (info->end == TLB_FLUSH_ALL) ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++ else + on_each_cpu(do_kernel_range_flush, info, 1); + +- put_flush_tlb_info(); +- preempt_enable(); +- } ++ put_flush_tlb_info(); + } + + /* diff --git a/debian/patches/patchset-zen/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch b/debian/patches/patchset-zen/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch new file mode 100644 index 0000000..e717e5b --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch @@ -0,0 +1,103 @@ +From e5d151337c384934c9b669967d72f9b29781b308 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:37 -0500 +Subject: x86/mm: Add INVLPGB feature and Kconfig entry + +In addition, the CPU advertises the maximum number of pages that can be +shot down with one INVLPGB instruction in CPUID. Save that information +for later use. + + [ bp: use cpu_has(), typos, massage. ] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com +--- + arch/x86/Kconfig.cpu | 4 ++++ + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/disabled-features.h | 8 +++++++- + arch/x86/include/asm/tlbflush.h | 3 +++ + arch/x86/kernel/cpu/amd.c | 6 ++++++ + 5 files changed, 21 insertions(+), 1 deletion(-) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT + This lets you choose what x86 vendor support code your kernel + will include. 
+ ++config BROADCAST_TLB_FLUSH ++ def_bool y ++ depends on CPU_SUP_AMD && 64BIT ++ + config CPU_SUP_INTEL + default y + bool "Support Intel processors" if PROCESSOR_SELECT +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -338,6 +338,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -129,6 +129,12 @@ + #define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) + #endif + ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++#define DISABLE_INVLPGB 0 ++#else ++#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -146,7 +152,7 @@ + #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) + #define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) +-#define DISABLED_MASK13 0 ++#define DISABLED_MASK13 (DISABLE_INVLPGB) + #define DISABLED_MASK14 0 + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void) + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; + ++/* How many pages can be invalidated with one INVLPGB. */ ++extern u16 invlpgb_count_max; ++ + extern void initialize_tlbstate_and_flush(void); + + /* +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -29,6 +29,8 @@ + + #include "cpu.h" + ++u16 invlpgb_count_max __ro_after_init; ++ + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) + { + u32 gprs[8] = { 0 }; +@@ -1139,6 +1141,10 @@ static void cpu_detect_tlb_amd(struct cp + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ if (cpu_has(c, X86_FEATURE_INVLPGB)) ++ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; + } + + static const struct cpu_dev amd_cpu_dev = { diff --git a/debian/patches/patchset-zen/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch b/debian/patches/patchset-zen/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch new file mode 100644 index 0000000..509a237 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch @@ -0,0 +1,170 @@ +From 9bbface3289771c5990e97ca047a52faaeafdb83 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Fri, 28 Feb 2025 20:32:30 +0100 +Subject: x86/mm: Add INVLPGB support code + +Add helper functions and definitions needed to use broadcast TLB +invalidation on AMD CPUs. 
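For reference, the enumeration read by cpu_detect_tlb_amd() above can also be probed from user space. A hedged sketch using GCC's cpuid.h (leaf 0x80000008: EBX bit 3 advertises INVLPGB/TLBSYNC, EDX[15:0] holds the page-count limit):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() returns 0 if the extended leaf is unsupported */
	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 1;

	if (ebx & (1u << 3))	/* INVLPGB/TLBSYNC supported */
		printf("INVLPGB: up to %u pages per flush\n", (edx & 0xffff) + 1);
	else
		printf("INVLPGB not supported\n");
	return 0;
}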
+
+ [ bp:
+   - Cleanup commit message
+   - Improve and expand comments
+   - push the preemption guards inside the invlpgb* helpers
+   - merge improvements from dhansen
+   - add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
+     can't do DCE properly yet and looks at the inline asm and
+     complains about it getting a u64 argument on 32-bit code ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
+---
+ arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 132 insertions(+)
+
+--- a/arch/x86/include/asm/tlb.h
++++ b/arch/x86/include/asm/tlb.h
+@@ -6,6 +6,9 @@
+ static inline void tlb_flush(struct mmu_gather *tlb);
+ 
+ #include <asm-generic/tlb.h>
++#include <linux/kernel.h>
++#include <vdso/bits.h>
++#include <vdso/page.h>
+ 
+ static inline void tlb_flush(struct mmu_gather *tlb)
+ {
+@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
+ 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ }
+ 
++enum addr_stride {
++	PTE_STRIDE = 0,
++	PMD_STRIDE = 1
++};
++
++#ifdef CONFIG_BROADCAST_TLB_FLUSH
++/*
++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
++ *
++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
++ * be done in a parallel fashion.
++ *
++ * The instruction takes the number of extra pages to invalidate, beyond
++ * the first page, while __invlpgb gets the more human readable number of
++ * pages to invalidate.
++ *
++ * The bits in rax[0:2] determine respectively which components of the address
++ * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
++ * address in the specified range matches.
++ *
++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
++ * this CPU have completed.
++ */
++static inline void __invlpgb(unsigned long asid, unsigned long pcid,
++			     unsigned long addr, u16 nr_pages,
++			     enum addr_stride stride, u8 flags)
++{
++	u32 edx = (pcid << 16) | asid;
++	u32 ecx = (stride << 31) | (nr_pages - 1);
++	u64 rax = addr | flags;
++
++	/* The low bits in rax are for flags. Verify addr is clean. */
++	VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
++
++	/* INVLPGB; supported in binutils >= 2.36. */
++	asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
++}
++
++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
++{
++	__invlpgb(asid, pcid, 0, 1, 0, flags);
++}
++
++static inline void __tlbsync(void)
++{
++	/*
++	 * TLBSYNC waits for INVLPGB instructions originating on the same CPU
++	 * to have completed. Print a warning if the task has been migrated,
++	 * and might not be waiting on all the INVLPGBs issued during this TLB
++	 * invalidation sequence.
++	 */
++	cant_migrate();
++
++	/* TLBSYNC: supported in binutils >= 2.36. */
++	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
++}
++#else
++/* Some compilers (I'm looking at you clang!) simply can't do DCE */
++static inline void __invlpgb(unsigned long asid, unsigned long pcid,
++			     unsigned long addr, u16 nr_pages,
++			     enum addr_stride s, u8 flags) { }
++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
++static inline void __tlbsync(void) { }
++#endif
++
++/*
++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
++ * of the three.
For example: ++ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - FLAG_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first is used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_FLAG_VA BIT(0) ++#define INVLPGB_FLAG_PCID BIT(1) ++#define INVLPGB_FLAG_ASID BIT(2) ++#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FLAG_FINAL_ONLY BIT(4) ++#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) ++ ++/* The implied mode when all bits are clear: */ ++#define INVLPGB_MODE_ALL_NONGLOBALS 0UL ++ ++static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, ++ unsigned long addr, ++ u16 nr, bool stride) ++{ ++ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; ++ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; ++ ++ __invlpgb(0, pcid, addr, nr, str, flags); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) ++{ ++ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ /* ++ * TLBSYNC at the end needs to make sure all flushes done on the ++ * current CPU have been executed system-wide. Therefore, make ++ * sure nothing gets migrated in-between but disable preemption ++ * as it is cheaper. ++ */ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); ++ __tlbsync(); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) ++{ ++ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); ++ __tlbsync(); ++} + #endif /* _ASM_X86_TLB_H */ diff --git a/debian/patches/patchset-zen/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch b/debian/patches/patchset-zen/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch new file mode 100644 index 0000000..d2d2e70 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch @@ -0,0 +1,97 @@ +From 293fdf15ead45cd235e12a4f62f81767f7bce528 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:39 -0500 +Subject: x86/mm: Use INVLPGB for kernel TLB flushes + +Use broadcast TLB invalidation for kernel addresses when available. +Remove the need to send IPIs for kernel TLB flushes. + + [ bp: Integrate dhansen's comments additions, merge the + flush_tlb_all() change into this one too. 
] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com +--- + arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 44 insertions(+), 4 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1064,7 +1064,6 @@ void flush_tlb_mm_range(struct mm_struct + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + } + +- + static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +@@ -1074,7 +1073,32 @@ static void do_flush_tlb_all(void *info) + void flush_tlb_all(void) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ ++ /* First try (faster) hardware-assisted TLB invalidation. */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else ++ /* Fall back to the IPI-based invalidation. */ ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++} ++ ++/* Flush an arbitrarily large range of memory with INVLPGB. */ ++static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) ++{ ++ unsigned long addr, nr; ++ ++ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { ++ nr = (info->end - addr) >> PAGE_SHIFT; ++ ++ /* ++ * INVLPGB has a limit on the size of ranges it can ++ * flush. Break up large flushes. ++ */ ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ ++ invlpgb_flush_addr_nosync(addr, nr); ++ } ++ __tlbsync(); + } + + static void do_kernel_range_flush(void *info) +@@ -1087,6 +1111,22 @@ static void do_kernel_range_flush(void * + flush_tlb_one_kernel(addr); + } + ++static void kernel_tlb_flush_all(struct flush_tlb_info *info) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++} ++ ++static void kernel_tlb_flush_range(struct flush_tlb_info *info) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_kernel_range_flush(info); ++ else ++ on_each_cpu(do_kernel_range_flush, info, 1); ++} ++ + void flush_tlb_kernel_range(unsigned long start, unsigned long end) + { + struct flush_tlb_info *info; +@@ -1097,9 +1137,9 @@ void flush_tlb_kernel_range(unsigned lon + TLB_GENERATION_INVALID); + + if (info->end == TLB_FLUSH_ALL) +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ kernel_tlb_flush_all(info); + else +- on_each_cpu(do_kernel_range_flush, info, 1); ++ kernel_tlb_flush_range(info); + + put_flush_tlb_info(); + } diff --git a/debian/patches/patchset-zen/invlpgb-v9/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch b/debian/patches/patchset-zen/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch similarity index 58% rename from debian/patches/patchset-zen/invlpgb-v9/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch rename to debian/patches/patchset-zen/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch index cf28af9..5f38a1b 100644 --- a/debian/patches/patchset-zen/invlpgb-v9/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch +++ b/debian/patches/patchset-zen/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch @@ -1,23 +1,25 @@ -From 5e5219596683c3b8178e09f6ec1e75154537325f Mon Sep 17 00:00:00 2001 +From a093136bdb306345cd686f47c8faae8a608cfb47 Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:27 -0500 -Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing +Date: Tue, 25 Feb 
2025 22:00:41 -0500 +Subject: x86/mm: Use broadcast TLB flushing in page reclaim -In the page reclaim code, we only track the CPU(s) where the TLB needs -to be flushed, rather than all the individual mappings that may be getting -invalidated. +Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather +than all the individual mappings that may be getting invalidated. Use broadcast TLB flushing when that is available. + [ bp: Massage commit message. ] + Signed-off-by: Rik van Riel <riel@surriel.com> -Tested-by: Manali Shukla <Manali.Shukla@amd.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com --- arch/x86/mm/tlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c -@@ -1330,7 +1330,9 @@ void arch_tlbbatch_flush(struct arch_tlb +@@ -1320,7 +1320,9 @@ void arch_tlbbatch_flush(struct arch_tlb * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case. */ diff --git a/debian/patches/patchset-zen/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch b/debian/patches/patchset-zen/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch new file mode 100644 index 0000000..30c4b96 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch @@ -0,0 +1,286 @@ +From ef345ff96b47f21932c489edd2ebb44fbe3cb517 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:42 -0500 +Subject: x86/mm: Add global ASID allocation helper functions + +Add functions to manage global ASID space. Multithreaded processes that are +simultaneously active on 4 or more CPUs can get a global ASID, resulting in the +same PCID being used for that process on every CPU. + +This in turn will allow the kernel to use hardware-assisted TLB flushing +through AMD INVLPGB or Intel RAR for these processes. + + [ bp: + - Extend use_global_asid() comment + - s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g + - other touchups ] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com +--- + arch/x86/include/asm/mmu.h | 12 +++ + arch/x86/include/asm/mmu_context.h | 2 + + arch/x86/include/asm/tlbflush.h | 37 +++++++ + arch/x86/mm/tlb.c | 154 ++++++++++++++++++++++++++++- + 4 files changed, 202 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -69,6 +69,18 @@ typedef struct { + u16 pkey_allocation_map; + s16 execute_only_pkey; + #endif ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ /* ++ * The global ASID will be a non-zero value when the process has ++ * the same ASID across all CPUs, allowing it to make use of ++ * hardware-assisted remote TLB invalidation like AMD INVLPGB. ++ */ ++ u16 global_asid; ++ ++ /* The process is transitioning to a new global ASID number. */ ++ bool asid_transition; ++#endif + } mm_context_t; + + #define INIT_MM_CONTEXT(mm) \ +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++extern void mm_free_global_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. 
Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -6,6 +6,7 @@ + #include <linux/mmu_notifier.h> + #include <linux/sched.h> + ++#include <asm/barrier.h> + #include <asm/processor.h> + #include <asm/cpufeature.h> + #include <asm/special_insns.h> +@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long + void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + ++static inline bool is_dyn_asid(u16 asid) ++{ ++ return asid < TLB_NR_DYN_ASIDS; ++} ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++static inline u16 mm_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return 0; ++ ++ asid = smp_load_acquire(&mm->context.global_asid); ++ ++ /* mm->context.global_asid is either 0, or a global ASID */ ++ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); ++ ++ return asid; ++} ++ ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ /* ++ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> ++ * finish_asid_transition() needs to observe asid_transition = true ++ * once it observes global_asid. ++ */ ++ mm->context.asid_transition = true; ++ smp_store_release(&mm->context.global_asid, asid); ++} ++#else ++static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } ++#endif /* CONFIG_BROADCAST_TLB_FLUSH */ ++ + #ifdef CONFIG_PARAVIRT + #include <asm/paravirt.h> + #endif +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st + } + + /* ++ * Global ASIDs are allocated for multi-threaded processes that are ++ * active on multiple CPUs simultaneously, giving each of those ++ * processes the same PCID on every CPU, for use with hardware-assisted ++ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. ++ * ++ * These global ASIDs are held for the lifetime of the process. ++ */ ++static DEFINE_RAW_SPINLOCK(global_asid_lock); ++static u16 last_global_asid = MAX_ASID_AVAILABLE; ++static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); ++static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); ++static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++/* ++ * When the search for a free ASID in the global ASID space reaches ++ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously ++ * freed global ASIDs are safe to re-use. ++ * ++ * This way the global flush only needs to happen at ASID rollover ++ * time, and not at ASID allocation time. 
++ */ ++static void reset_global_asid_space(void) ++{ ++ lockdep_assert_held(&global_asid_lock); ++ ++ invlpgb_flush_all_nonglobals(); ++ ++ /* ++ * The TLB flush above makes it safe to re-use the previously ++ * freed global ASIDs. ++ */ ++ bitmap_andnot(global_asid_used, global_asid_used, ++ global_asid_freed, MAX_ASID_AVAILABLE); ++ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); ++ ++ /* Restart the search from the start of global ASID space. */ ++ last_global_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 allocate_global_asid(void) ++{ ++ u16 asid; ++ ++ lockdep_assert_held(&global_asid_lock); ++ ++ /* The previous allocation hit the edge of available address space */ ++ if (last_global_asid >= MAX_ASID_AVAILABLE - 1) ++ reset_global_asid_space(); ++ ++ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); ++ ++ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { ++ /* This should never happen. */ ++ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", ++ global_asid_available); ++ return 0; ++ } ++ ++ /* Claim this global ASID. */ ++ __set_bit(asid, global_asid_used); ++ last_global_asid = asid; ++ global_asid_available--; ++ return asid; ++} ++ ++/* ++ * Check whether a process is currently active on more than @threshold CPUs. ++ * This is a cheap estimation on whether or not it may make sense to assign ++ * a global ASID to this process, and use broadcast TLB invalidation. ++ */ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ /* This quick check should eliminate most single threaded programs. */ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ /* Slower check to make sure. */ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. */ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a global ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm_global_asid(mm)) ++ return; ++ ++ /* ++ * The last global ASID was consumed while waiting for the lock. ++ * ++ * If this fires, a more aggressive ASID reuse scheme might be ++ * needed. ++ */ ++ if (!global_asid_available) { ++ VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); ++ return; ++ } ++ ++ asid = allocate_global_asid(); ++ if (!asid) ++ return; ++ ++ mm_assign_global_asid(mm, asid); ++} ++ ++void mm_free_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ if (!mm_global_asid(mm)) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* The global ASID can be re-used only after flush at wrap-around. */ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ __set_bit(mm->context.global_asid, global_asid_freed); ++ ++ mm->context.global_asid = 0; ++ global_asid_available++; ++#endif ++} ++ ++/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. 
+ * diff --git a/debian/patches/patchset-zen/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch b/debian/patches/patchset-zen/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch new file mode 100644 index 0000000..9d70eb5 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch @@ -0,0 +1,215 @@ +From b3eb743c32515bf8fca7b619dd2a2c64b5812dd8 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:43 -0500 +Subject: x86/mm: Handle global ASID context switch and TLB flush + +Do context switch and TLB flush support for processes that use a global +ASID and PCID across all CPUs. + +At both context switch time and TLB flush time, it needs to be checked whether +a task is switching to a global ASID, and, if so, reload the TLB with the new +ASID as appropriate. + +In both code paths, the TLB flush is avoided if a global ASID is used, because +the global ASIDs are always kept up to date across CPUs, even when the +process is not running on a CPU. + + [ bp: + - Massage + - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi + ] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com +--- + arch/x86/include/asm/tlbflush.h | 14 ++++++ + arch/x86/mm/tlb.c | 77 ++++++++++++++++++++++++++++++--- + 2 files changed, 84 insertions(+), 7 deletions(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid) + return asid < TLB_NR_DYN_ASIDS; + } + ++static inline bool is_global_asid(u16 asid) ++{ ++ return !is_dyn_asid(asid); ++} ++ + #ifdef CONFIG_BROADCAST_TLB_FLUSH + static inline u16 mm_global_asid(struct mm_struct *mm) + { +@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid + mm->context.asid_transition = true; + smp_store_release(&mm->context.global_asid, asid); + } ++ ++static inline bool mm_in_asid_transition(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ return mm && READ_ONCE(mm->context.asid_transition); ++} + #else + static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } + static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } ++static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } + #endif /* CONFIG_BROADCAST_TLB_FLUSH */ + + #ifdef CONFIG_PARAVIRT +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st + return; + } + ++ /* ++ * TLB consistency for global ASIDs is maintained with hardware assisted ++ * remote TLB flushing. Global ASIDs are always up to date. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ u16 global_asid = mm_global_asid(next); ++ ++ if (global_asid) { ++ *new_asid = global_asid; ++ *need_flush = false; ++ return; ++ } ++ } ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc + } + + /* ++ * Is the mm transitioning from a CPU-local ASID to a global ASID? 
++ */ ++static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ u16 global_asid = mm_global_asid(mm); ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ /* Process is transitioning to a global ASID */ ++ if (global_asid && asid != global_asid) ++ return true; ++ ++ return false; ++} ++ ++/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * +@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ VM_WARN_ON(is_dyn_asid(prev_asid) && ++ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + /* +@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + ++ /* Check if the current mm is transitioning to a global ASID */ ++ if (mm_needs_global_asid(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this ASID up to date ++ * all the time. ++ */ ++ if (is_global_asid(prev_asid)) ++ return; ++ + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same +@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct + cond_mitigation(tsk); + + /* ++ * Let nmi_uaccess_okay() and finish_asid_transition() ++ * know that CR3 is changing. ++ */ ++ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); ++ barrier(); ++ ++ /* + * Leave this CPU in prev's mm_cpumask. Atomic writes to + * mm_cpumask can be expensive under contention. The CPU + * will be removed lazily at TLB flush time. +@@ -768,14 +821,12 @@ void switch_mm_irqs_off(struct mm_struct + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); +- +- /* Let nmi_uaccess_okay() know that we're changing CR3. */ +- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); +- barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_WARN_ON_ONCE(is_global_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -894,7 +945,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -917,6 +968,16 @@ static void flush_tlb_func(void *info) + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a global ASID */ ++ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. 
*/ ++ if (is_global_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -934,6 +995,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -1101,7 +1164,7 @@ STATIC_NOPV void native_flush_tlb_multi( + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || mm_in_asid_transition(info->mm)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, diff --git a/debian/patches/patchset-zen/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch b/debian/patches/patchset-zen/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch new file mode 100644 index 0000000..23a0570 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch @@ -0,0 +1,88 @@ +From c63f1d0a496de7a926b92b52061905edfc8428a4 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Tue, 25 Feb 2025 22:00:44 -0500 +Subject: x86/mm: Add global ASID process exit helpers + +A global ASID is allocated for the lifetime of a process. Free the global ASID +at process exit time. + + [ bp: Massage, create helpers, hide details inside them. ] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com +--- + arch/x86/include/asm/mmu_context.h | 8 +++++++- + arch/x86/include/asm/tlbflush.h | 9 +++++++++ + 2 files changed, 16 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -2,7 +2,6 @@ + #ifndef _ASM_X86_MMU_CONTEXT_H + #define _ASM_X86_MMU_CONTEXT_H + +-#include <asm/desc.h> + #include <linux/atomic.h> + #include <linux/mm_types.h> + #include <linux/pkeys.h> +@@ -13,6 +12,7 @@ + #include <asm/paravirt.h> + #include <asm/debugreg.h> + #include <asm/gsseg.h> ++#include <asm/desc.h> + + extern atomic64_t last_mm_ctx_id; + +@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++#define mm_init_global_asid mm_init_global_asid ++extern void mm_init_global_asid(struct mm_struct *mm); ++ + extern void mm_free_global_asid(struct mm_struct *mm); + + /* +@@ -163,6 +166,8 @@ static inline int init_new_context(struc + mm->context.execute_only_pkey = -1; + } + #endif ++ ++ mm_init_global_asid(mm); + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -172,6 +177,7 @@ static inline int init_new_context(struc + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++ mm_free_global_asid(mm); + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct + return asid; + } + ++static inline void mm_init_global_asid(struct mm_struct *mm) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ mm->context.global_asid = 0; ++ mm->context.asid_transition = false; ++ } ++} ++ + static inline void 
mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+ {
+ 	/*
+@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
+ }
+ #else
+ static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
++static inline void mm_init_global_asid(struct mm_struct *mm) { }
+ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+ static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+ #endif /* CONFIG_BROADCAST_TLB_FLUSH */
diff --git a/debian/patches/patchset-zen/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch b/debian/patches/patchset-zen/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
new file mode 100644
index 0000000..48aced4
--- /dev/null
+++ b/debian/patches/patchset-zen/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
@@ -0,0 +1,219 @@
+From e16bb18388207841efa841b9b11e69c886817024 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:45 -0500
+Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
+ processes
+
+There is not enough room in the 12-bit ASID address space to hand out
+broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
+when they are observed to be simultaneously running on 4 or more CPUs.
+
+This also allows single-threaded processes to continue using the cheaper,
+local TLB invalidation instructions like INVLPG.
+
+Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
+a generically named broadcast_tlb_flush() function which can later also be
+used for Intel RAR.
+
+Combined with the removal of unnecessary lru_add_drain() calls (see
+https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
+a nice performance boost for the will-it-scale tlb_flush2_threads test on an
+AMD Milan system with 36 cores:
+
+  - vanilla kernel:           527k loops/second
+  - lru_add_drain removal:    731k loops/second
+  - only INVLPGB:             527k loops/second
+  - lru_add_drain + INVLPGB: 1157k loops/second
+
+Profiling with only the INVLPGB changes showed that while TLB invalidation
+went down from 40% of the total CPU time to only around 4% of CPU time, the
+contention simply moved to the LRU lock.
+
+Fixing both at the same time roughly doubles the number of iterations per
+second for this case.
+
+Comparing will-it-scale tlb_flush2_threads with several different numbers of
+threads on a 72 CPU AMD Milan shows similar results. The number represents the
+total number of loops per second across all the threads:
+
+ threads     tip      INVLPGB
+
+ 1           315k     304k
+ 2           423k     424k
+ 4           644k     1032k
+ 8           652k     1267k
+ 16          737k     1368k
+ 32          759k     1199k
+ 64          636k     1094k
+ 72          609k     993k
+
+1 and 2 thread performance is similar with and without INVLPGB, because
+INVLPGB is only used on processes using 4 or more CPUs simultaneously.
+
+The number is the median across 5 runs.
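A condensed, plain-C model of the heuristic described above; the pid/jiffies masking mirrors the cheap rate limit in consider_global_asid() below (standalone sketch, not kernel code):

#include <stdbool.h>

/* Sample roughly 1 flush in 32: only act when the low 5 bits of the
 * pid happen to line up with the low 5 bits of the tick counter. */
static bool time_to_check(int pid, unsigned long jiffies)
{
	return (pid & 0x1f) == (jiffies & 0x1f);
}

/* Hand out a global ASID only to processes seen on 4 or more CPUs. */
static bool wants_global_asid(int active_cpus, int pid, unsigned long jiffies)
{
	return time_to_check(pid, jiffies) && active_cpus > 3;
}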
+ +Some numbers closer to real world performance can be found at Phoronix, thanks +to Michael: + +https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits + + [ bp: + - Massage + - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi + - :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi + - Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com + ] + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Reviewed-by: Nadav Amit <nadav.amit@gmail.com> +Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com +--- + arch/x86/include/asm/tlbflush.h | 6 ++ + arch/x86/mm/tlb.c | 104 +++++++++++++++++++++++++++++++- + 2 files changed, 109 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid + smp_store_release(&mm->context.global_asid, asid); + } + ++static inline void mm_clear_asid_transition(struct mm_struct *mm) ++{ ++ WRITE_ONCE(mm->context.asid_transition, false); ++} ++ + static inline bool mm_in_asid_transition(struct mm_struct *mm) + { + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) +@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition + static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } + static inline void mm_init_global_asid(struct mm_struct *mm) { } + static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } ++static inline void mm_clear_asid_transition(struct mm_struct *mm) { } + static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } + #endif /* CONFIG_BROADCAST_TLB_FLUSH */ + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct + } + + /* ++ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 ++ * systems have over 8k CPUs. Because of this potential ASID shortage, ++ * global ASIDs are handed out to processes that have frequent TLB ++ * flushes and are active on 4 or more CPUs simultaneously. ++ */ ++static void consider_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ /* ++ * Assign a global ASID if the process is active on ++ * 4 or more CPUs simultaneously. ++ */ ++ if (mm_active_cpus_exceeds(mm, 3)) ++ use_global_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_global_asid(mm); ++ int cpu; ++ ++ if (!mm_in_asid_transition(mm)) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* ++ * The remote CPU is context switching. Wait for that to ++ * finish, to catch the unlikely case of it switching to ++ * the target mm with an out of date ASID. ++ */ ++ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) ++ cpu_relax(); ++ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the global ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ * ++ * This can race with the CPU switching to another task; ++ * that results in a (harmless) extra IPI. 
++ */ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the global ASID. */ ++ mm_clear_asid_transition(mm); ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long asid = mm_global_asid(info->mm); ++ unsigned long addr = info->start; ++ ++ /* ++ * TLB flushes with INVLPGB are kicked off asynchronously. ++ * The inc_mm_tlb_gen() guarantees page table updates are done ++ * before these TLB flushes happen. ++ */ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid_nosync(user_pcid(asid)); ++ } else do { ++ unsigned long nr = 1; ++ ++ if (info->stride_shift <= PMD_SHIFT) { ++ nr = (info->end - addr) >> info->stride_shift; ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ } ++ ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); ++ ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ __tlbsync(); ++} ++ ++/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * +@@ -1260,9 +1359,12 @@ void flush_tlb_mm_range(struct mm_struct + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (mm_global_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); + flush_tlb_multi(mm_cpumask(mm), info); ++ consider_global_asid(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); diff --git a/debian/patches/patchset-zen/invlpgb-v9/0011-x86-mm-enable-AMD-translation-cache-extensions.patch b/debian/patches/patchset-zen/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch similarity index 69% rename from debian/patches/patchset-zen/invlpgb-v9/0011-x86-mm-enable-AMD-translation-cache-extensions.patch rename to debian/patches/patchset-zen/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch index fe11c6f..f9d3733 100644 --- a/debian/patches/patchset-zen/invlpgb-v9/0011-x86-mm-enable-AMD-translation-cache-extensions.patch +++ b/debian/patches/patchset-zen/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch @@ -1,28 +1,31 @@ -From 101ba03a6474bbc52971505abf1e3ee9613f255b Mon Sep 17 00:00:00 2001 +From 9c88454149bd22cc3d8618b4445d32aaf48cadce Mon Sep 17 00:00:00 2001 From: Rik van Riel <riel@surriel.com> -Date: Wed, 5 Feb 2025 23:43:30 -0500 -Subject: x86/mm: enable AMD translation cache extensions +Date: Tue, 25 Feb 2025 22:00:47 -0500 +Subject: x86/mm: Enable AMD translation cache extensions With AMD TCE (translation cache extensions) only the intermediate mappings that cover the address range zapped by INVLPG / INVLPGB get invalidated, rather than all intermediate mappings getting zapped at every TLB invalidation. -This can help reduce the TLB miss rate, by keeping more intermediate -mappings in the cache. 
+This can help reduce the TLB miss rate, by keeping more intermediate mappings
+in the cache.
 
 From the AMD manual:
 
-Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
-to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
-TLB entries. When this bit is 0, these instructions remove the target PTE
-from the TLB as well as all upper-level table entries that are cached
-in the TLB, whether or not they are associated with the target PTE.
-When this bit is set, these instructions will remove the target PTE and
-only those upper-level entries that lead to the target PTE in
-the page table hierarchy, leaving unrelated upper-level entries intact.
+Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
+1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
+entries. When this bit is 0, these instructions remove the target PTE from the
+TLB as well as all upper-level table entries that are cached in the TLB,
+whether or not they are associated with the target PTE. When this bit is set,
+these instructions will remove the target PTE and only those upper-level
+entries that lead to the target PTE in the page table hierarchy, leaving
+unrelated upper-level entries intact.
+
+ [ bp: use cpu_has()... I know, it is a mess. ]
 
 Signed-off-by: Rik van Riel <riel@surriel.com>
-Tested-by: Manali Shukla <Manali.Shukla@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
 ---
  arch/x86/include/asm/msr-index.h | 2 ++
  arch/x86/kernel/cpu/amd.c        | 4 ++++
@@ -49,13 +52,13 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
  /*
 --- a/arch/x86/kernel/cpu/amd.c
 +++ b/arch/x86/kernel/cpu/amd.c
-@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86
+@@ -1075,6 +1075,10 @@ static void init_amd(struct cpuinfo_x86
  	/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
  	clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
 +
 +	/* Enable Translation Cache Extension */
-+	if (cpu_feature_enabled(X86_FEATURE_TCE))
++	if (cpu_has(c, X86_FEATURE_TCE))
 +		msr_set_bit(MSR_EFER, _EFER_TCE);
  }
 
diff --git a/debian/patches/patchset-zen/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch b/debian/patches/patchset-zen/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
new file mode 100644
index 0000000..c3525b1
--- /dev/null
+++ b/debian/patches/patchset-zen/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
@@ -0,0 +1,121 @@
+From 20dfd0edb14a1d0aecd5eb227f2db64487201976 Mon Sep 17 00:00:00 2001
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Tue, 4 Mar 2025 12:59:56 +0100
+Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
+
+When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
+the ASID valid bit is not set, the instruction will flush the TLB entries that
+match the specified criteria for any ASID, not just those of the host. If
+virtual machines are running on the system, this may result in inadvertent
+flushes of guest TLB entries.
+
+When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
+not intercepted by the hypervisor, the hardware will replace the requested ASID
+with the guest ASID and set the ASID valid bit before doing the broadcast
+invalidation. Thus a guest is only able to flush its own TLB entries.
+ +So to limit the host TLB flushing reach, always set the ASID valid bit using an +ASID value of 0 (which represents the host/hypervisor). This will will result in +the desired effect in both host and guest. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> +Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local +--- + arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++----------------- + 1 file changed, 32 insertions(+), 26 deletions(-) + +--- a/arch/x86/include/asm/tlb.h ++++ b/arch/x86/include/asm/tlb.h +@@ -33,6 +33,27 @@ enum addr_stride { + PMD_STRIDE = 1 + }; + ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - FLAG_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first is used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_FLAG_VA BIT(0) ++#define INVLPGB_FLAG_PCID BIT(1) ++#define INVLPGB_FLAG_ASID BIT(2) ++#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FLAG_FINAL_ONLY BIT(4) ++#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) ++ ++/* The implied mode when all bits are clear: */ ++#define INVLPGB_MODE_ALL_NONGLOBALS 0UL ++ + #ifdef CONFIG_BROADCAST_TLB_FLUSH + /* + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. +@@ -40,14 +61,20 @@ enum addr_stride { + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can + * be done in a parallel fashion. + * +- * The instruction takes the number of extra pages to invalidate, beyond +- * the first page, while __invlpgb gets the more human readable number of +- * pages to invalidate. ++ * The instruction takes the number of extra pages to invalidate, beyond the ++ * first page, while __invlpgb gets the more human readable number of pages to ++ * invalidate. + * + * The bits in rax[0:2] determine respectively which components of the address + * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* + * address in the specified range matches. + * ++ * Since it is desired to only flush TLB entries for the ASID that is executing ++ * the instruction (a host/hypervisor or a guest), the ASID valid bit should ++ * always be set. On a host/hypervisor, the hardware will use the ASID value ++ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will ++ * use the actual ASID value of the guest. ++ * + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from + * this CPU have completed. + */ +@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo + unsigned long addr, u16 nr_pages, + enum addr_stride stride, u8 flags) + { +- u32 edx = (pcid << 16) | asid; ++ u64 rax = addr | flags | INVLPGB_FLAG_ASID; + u32 ecx = (stride << 31) | (nr_pages - 1); +- u64 rax = addr | flags; ++ u32 edx = (pcid << 16) | asid; + + /* The low bits in rax are for flags. Verify addr is clean. */ + VM_WARN_ON_ONCE(addr & ~PAGE_MASK); +@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne + static inline void __tlbsync(void) { } + #endif + +-/* +- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination +- * of the three. 
For example: +- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address +- * - FLAG_PCID: invalidate all TLB entries matching the PCID +- * +- * The first is used to invalidate (kernel) mappings at a particular +- * address across all processes. +- * +- * The latter invalidates all TLB entries matching a PCID. +- */ +-#define INVLPGB_FLAG_VA BIT(0) +-#define INVLPGB_FLAG_PCID BIT(1) +-#define INVLPGB_FLAG_ASID BIT(2) +-#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) +-#define INVLPGB_FLAG_FINAL_ONLY BIT(4) +-#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) +- +-/* The implied mode when all bits are clear: */ +-#define INVLPGB_MODE_ALL_NONGLOBALS 0UL +- + static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + u16 nr, bool stride) diff --git a/debian/patches/patchset-zen/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch b/debian/patches/patchset-zen/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch new file mode 100644 index 0000000..00a5448 --- /dev/null +++ b/debian/patches/patchset-zen/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch @@ -0,0 +1,70 @@ +From b5a210ad153e5448876c422f5c77d3dcd83abac6 Mon Sep 17 00:00:00 2001 +From: Rik van Riel <riel@surriel.com> +Date: Wed, 19 Mar 2025 13:25:20 -0400 +Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped + +Track whether pages were unmapped from any MM (even ones with a currently +empty mm_cpumask) by the reclaim code, to figure out whether or not +broadcast TLB flush should be done when reclaim finishes. + +The reason any MM must be tracked, and not only ones contributing to the +tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to +date even on CPUs where the MM is not currently active. + +This change allows reclaim to avoid doing TLB flushes when only clean page +cache pages and/or slab memory were reclaimed, which is fairly common. + +( This is a simpler alternative to the code that was in my INVLPGB series + before, and it seems to capture most of the benefit due to how common + it is to reclaim only page cache. ) + +Signed-off-by: Rik van Riel <riel@surriel.com> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn +--- + arch/x86/include/asm/tlbbatch.h | 5 +++++ + arch/x86/include/asm/tlbflush.h | 1 + + arch/x86/mm/tlb.c | 3 ++- + 3 files changed, 8 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/tlbbatch.h ++++ b/arch/x86/include/asm/tlbbatch.h +@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch { + * the PFNs being flushed.. + */ + struct cpumask cpumask; ++ /* ++ * Set if pages were unmapped from any MM, even one that does not ++ * have active CPUs in its cpumask. ++ */ ++ bool unmapped_pages; + }; + + #endif /* _ARCH_X86_TLBBATCH_H */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen + { + inc_mm_tlb_gen(mm); + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); ++ batch->unmapped_pages = true; + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); + } + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1633,8 +1633,9 @@ void arch_tlbbatch_flush(struct arch_tlb + * a local TLB flush is needed. 
Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) { + invlpgb_flush_all_nonglobals(); ++ batch->unmapped_pages = false; + } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { diff --git a/debian/patches/patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch b/debian/patches/patchset-zen/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch similarity index 57% rename from debian/patches/patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch rename to debian/patches/patchset-zen/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch index db78a6b..3382798 100644 --- a/debian/patches/patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch +++ b/debian/patches/patchset-zen/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch @@ -1,4 +1,4 @@ -From 4eb6615c1498cb8bff76c31e9596b585410f507d Mon Sep 17 00:00:00 2001 +From 4ad0ae3b81cd90c0729df9ac5f1ff21f4dad6130 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko <oleksandr@natalenko.name> Date: Mon, 30 Sep 2024 08:58:38 +0200 Subject: mm: expose per-process KSM control via syscalls @@ -43,145 +43,145 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl -@@ -502,3 +502,6 @@ - 570 common lsm_set_self_attr sys_lsm_set_self_attr - 571 common lsm_list_modules sys_lsm_list_modules - 572 common mseal sys_mseal -+573 common process_ksm_enable sys_process_ksm_enable -+574 common process_ksm_disable sys_process_ksm_disable -+575 common process_ksm_status sys_process_ksm_status +@@ -506,3 +506,6 @@ + 574 common getxattrat sys_getxattrat + 575 common listxattrat sys_listxattrat + 576 common removexattrat sys_removexattrat ++577 common process_ksm_enable sys_process_ksm_enable ++578 common process_ksm_disable sys_process_ksm_disable ++579 common process_ksm_status sys_process_ksm_status --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl -@@ -477,3 +477,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -481,3 +481,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl -@@ -462,3 +462,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -466,3 +466,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable 
sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl -@@ -468,3 +468,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -472,3 +472,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl -@@ -401,3 +401,6 @@ - 460 n32 lsm_set_self_attr sys_lsm_set_self_attr - 461 n32 lsm_list_modules sys_lsm_list_modules - 462 n32 mseal sys_mseal -+463 n32 process_ksm_enable sys_process_ksm_enable -+464 n32 process_ksm_disable sys_process_ksm_disable -+465 n32 process_ksm_status sys_process_ksm_status +@@ -405,3 +405,6 @@ + 464 n32 getxattrat sys_getxattrat + 465 n32 listxattrat sys_listxattrat + 466 n32 removexattrat sys_removexattrat ++467 n32 process_ksm_enable sys_process_ksm_enable ++468 n32 process_ksm_disable sys_process_ksm_disable ++469 n32 process_ksm_status sys_process_ksm_status --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl -@@ -377,3 +377,6 @@ - 460 n64 lsm_set_self_attr sys_lsm_set_self_attr - 461 n64 lsm_list_modules sys_lsm_list_modules - 462 n64 mseal sys_mseal -+463 n64 process_ksm_enable sys_process_ksm_enable -+464 n64 process_ksm_disable sys_process_ksm_disable -+465 n64 process_ksm_status sys_process_ksm_status +@@ -381,3 +381,6 @@ + 464 n64 getxattrat sys_getxattrat + 465 n64 listxattrat sys_listxattrat + 466 n64 removexattrat sys_removexattrat ++467 n64 process_ksm_enable sys_process_ksm_enable ++468 n64 process_ksm_disable sys_process_ksm_disable ++469 n64 process_ksm_status sys_process_ksm_status --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl -@@ -450,3 +450,6 @@ - 460 o32 lsm_set_self_attr sys_lsm_set_self_attr - 461 o32 lsm_list_modules sys_lsm_list_modules - 462 o32 mseal sys_mseal -+463 o32 process_ksm_enable sys_process_ksm_enable -+464 o32 process_ksm_disable sys_process_ksm_disable -+465 o32 process_ksm_status sys_process_ksm_status +@@ -454,3 +454,6 @@ + 464 o32 getxattrat sys_getxattrat + 465 o32 listxattrat sys_listxattrat + 466 o32 removexattrat sys_removexattrat ++467 o32 process_ksm_enable sys_process_ksm_enable ++468 o32 process_ksm_disable sys_process_ksm_disable ++469 o32 process_ksm_status sys_process_ksm_status --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl -@@ -461,3 +461,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -465,3 +465,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable 
sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl -@@ -553,3 +553,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl -@@ -465,3 +465,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl -@@ -466,3 +466,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -470,3 +470,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl -@@ -508,3 +508,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -512,3 +512,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -468,3 +468,6 @@ - 460 i386 lsm_set_self_attr sys_lsm_set_self_attr - 461 i386 lsm_list_modules sys_lsm_list_modules - 462 i386 mseal sys_mseal -+463 i386 process_ksm_enable sys_process_ksm_enable -+464 i386 process_ksm_disable 
sys_process_ksm_disable -+465 i386 process_ksm_status sys_process_ksm_status +@@ -472,3 +472,6 @@ + 464 i386 getxattrat sys_getxattrat + 465 i386 listxattrat sys_listxattrat + 466 i386 removexattrat sys_removexattrat ++467 i386 process_ksm_enable sys_process_ksm_enable ++468 i386 process_ksm_disable sys_process_ksm_disable ++469 i386 process_ksm_status sys_process_ksm_status --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -386,6 +386,9 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -390,6 +390,9 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status # # Due to a historical design error, certain syscalls are numbered differently --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl -@@ -433,3 +433,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -437,3 +437,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h -@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned lon +@@ -831,6 +831,9 @@ asmlinkage long sys_madvise(unsigned lon asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); @@ -193,26 +193,26 @@ Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> unsigned long flags); --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h -@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm - #define __NR_mseal 462 - __SYSCALL(__NR_mseal, sys_mseal) +@@ -850,8 +850,15 @@ __SYSCALL(__NR_listxattrat, sys_listxatt + #define __NR_removexattrat 466 + __SYSCALL(__NR_removexattrat, sys_removexattrat) -+#define __NR_process_ksm_enable 463 ++#define __NR_process_ksm_enable 467 +__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) -+#define __NR_process_ksm_disable 464 ++#define __NR_process_ksm_disable 468 +__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) -+#define __NR_process_ksm_status 465 ++#define __NR_process_ksm_status 469 +__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) + #undef __NR_syscalls --#define __NR_syscalls 463 -+#define __NR_syscalls 466 +-#define __NR_syscalls 467 ++#define __NR_syscalls 470 /* * 32 bit systems traditionally used different --- a/kernel/sys.c +++ b/kernel/sys.c -@@ -2791,6 +2791,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsi +@@ -2819,6 +2819,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsi return error; } @@ -371,28 +371,28 @@ 
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name> COND_SYSCALL(get_mempolicy); --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl -@@ -403,3 +403,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -407,3 +407,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl -@@ -553,3 +553,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl -@@ -465,3 +465,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch index 34f02dc..e873db2 100644 --- a/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch +++ b/debian/patches/patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch @@ -1,4 +1,4 @@ -From 95490afcba944883e7f911214391a1a1e2fa3261 Mon Sep 17 00:00:00 2001 +From 6d141e3121676e9ca50d6465a622b9a5d572219a Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> Date: Mon, 26 Apr 2021 22:12:46 +0200 Subject: ZEN: Add VHBA driver @@ -10,8 +10,8 @@ tag vhba-module-20240917 drivers/scsi/Makefile | 1 + drivers/scsi/vhba/Kconfig | 9 + drivers/scsi/vhba/Makefile | 4 + - drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++ - 5 files changed, 1140 insertions(+) + drivers/scsi/vhba/vhba.c | 1130 ++++++++++++++++++++++++++++++++++++ + 5 files changed, 1146 insertions(+) create mode 100644 drivers/scsi/vhba/Kconfig create mode 100644 
drivers/scsi/vhba/Makefile create mode 100644 drivers/scsi/vhba/vhba.c @@ -56,7 +56,7 @@ tag vhba-module-20240917 +ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror --- /dev/null +++ b/drivers/scsi/vhba/vhba.c -@@ -0,0 +1,1124 @@ +@@ -0,0 +1,1130 @@ +/* + * vhba.c + * @@ -1108,7 +1108,11 @@ tag vhba-module-20240917 + return 0; +} + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) +static int vhba_remove (struct platform_device *pdev) ++#else ++static void vhba_remove (struct platform_device *pdev) ++#endif +{ + struct vhba_host *vhost; + struct Scsi_Host *shost; @@ -1121,7 +1125,9 @@ tag vhba-module-20240917 + + kfree(vhost->commands); + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) + return 0; ++#endif +} + +static void vhba_release (struct device * dev) diff --git a/debian/patches/patchset-zen/sauce/0002-VHBA-fix-building-with-kernel-6.14-rc1.patch b/debian/patches/patchset-zen/sauce/0002-VHBA-fix-building-with-kernel-6.14-rc1.patch new file mode 100644 index 0000000..2040f9e --- /dev/null +++ b/debian/patches/patchset-zen/sauce/0002-VHBA-fix-building-with-kernel-6.14-rc1.patch @@ -0,0 +1,28 @@ +From 1f9910c9a54b424ad0cd415b981986937618c4ec Mon Sep 17 00:00:00 2001 +From: Rok Mandeljc <rok.mandeljc@gmail.com> +Date: Mon, 3 Feb 2025 21:05:32 +0100 +Subject: VHBA: fix building with kernel 6.14-rc1 + +Kernel 6.14-rc1 simplified the selection of tag allocation policy. +Instead of enum-based value, a boolean is used, and the corresponding +field in the `scsi_host_template` structure was renamed from +`tag_alloc_policy` to `tag_alloc_policy_rr`. + +See: https://github.com/torvalds/linux/commit/ce32496 +--- + drivers/scsi/vhba/vhba.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/scsi/vhba/vhba.c ++++ b/drivers/scsi/vhba/vhba.c +@@ -537,7 +537,9 @@ static struct scsi_host_template vhba_te + #if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) + .slave_alloc = vhba_slave_alloc, + #endif +-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 14, 0) ++ .tag_alloc_policy_rr = true, ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) + .tag_alloc_policy = BLK_TAG_ALLOC_RR, + #endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) diff --git a/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch b/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch deleted file mode 100644 index 3c0e0ea..0000000 --- a/debian/patches/patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 8a6a60b5a71d7f85351a9350eb651c4ce15b8f00 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> -Date: Sun, 15 Sep 2024 19:05:46 +0000 -Subject: vhba: Fix compat with kernel 6.11 - -Upstream commit 0edb555a65d1ef047a9805051c36922b52a38a9d changed the -return value of the `remove` callback from `int` to `void`. 
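
For reference, the compat idiom that both the old and the new VHBA fixes build on, reduced to its skeleton: one function body, two signatures, switched on the kernel version. A sketch only; example_remove() is an illustrative name:

	#include <linux/version.h>

	#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
	static int example_remove(struct platform_device *pdev)
	#else
	static void example_remove(struct platform_device *pdev)
	#endif
	{
		/* ... tear down per-device state here ... */
	#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
		return 0;	/* kernels before 6.11 expect an int result */
	#endif
	}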
---- - drivers/scsi/vhba/vhba.c | 6 ++++++ - 1 file changed, 6 insertions(+) - ---- a/drivers/scsi/vhba/vhba.c -+++ b/drivers/scsi/vhba/vhba.c -@@ -1049,7 +1049,11 @@ static int vhba_probe (struct platform_d - return 0; - } - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) - static int vhba_remove (struct platform_device *pdev) -+#else -+static void vhba_remove (struct platform_device *pdev) -+#endif - { - struct vhba_host *vhost; - struct Scsi_Host *shost; -@@ -1062,7 +1066,9 @@ static int vhba_remove (struct platform_ - - kfree(vhost->commands); - -+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) - return 0; -+#endif - } - - static void vhba_release (struct device * dev) diff --git a/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch index b6ad8a3..dd29fe3 100644 --- a/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch +++ b/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch @@ -1,4 +1,4 @@ -From 1cdff301de6db901bc2bfd7ce78016d9b824d667 Mon Sep 17 00:00:00 2001 +From 02b4d790bb05e24e7408a147f33e4e9ca0b805fa Mon Sep 17 00:00:00 2001 From: Daniel Drake <drake@endlessm.com> Date: Tue, 4 Jun 2019 14:51:21 +0800 Subject: ZEN: PCI: Add Intel remapped NVMe device support @@ -135,8 +135,8 @@ Contains: } static int ahci_get_irq_vector(struct ata_host *host, int port) -@@ -1896,7 +1889,9 @@ static int ahci_init_one(struct pci_dev - hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; +@@ -1898,7 +1891,9 @@ static int ahci_init_one(struct pci_dev + return -ENOMEM; /* detect remapped nvme devices */ - ahci_remap_check(pdev, ahci_pci_bar, hpriv); diff --git a/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch index 11cc0f3..4987d59 100644 --- a/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch +++ b/debian/patches/patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch @@ -1,4 +1,4 @@ -From 87b0cab8d8701db7754e5778b93ff83ffc64c7ae Mon Sep 17 00:00:00 2001 +From 17190525fdc9c9f73fe22832ab0631e9e1bbad6d Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf <sultan@kerneltoast.com> Date: Sun, 8 Mar 2020 00:31:35 -0800 Subject: ZEN: Disable stack conservation for GCC @@ -15,7 +15,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com> --- a/Makefile +++ b/Makefile -@@ -1026,11 +1026,6 @@ KBUILD_CFLAGS += -fno-strict-overflow +@@ -1078,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -24,6 +24,6 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com> -KBUILD_CFLAGS += -fconserve-stack -endif - - # change __FILE__ to the relative path from the srctree - KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) - + # change __FILE__ to the relative path to the source directory + ifdef building_out_of_srctree + KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=) diff --git a/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch b/debian/patches/patchset-zen/sauce/0005-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch similarity index 97% rename from debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch 
rename to debian/patches/patchset-zen/sauce/0005-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch index e19c6a8..2420336 100644 --- a/debian/patches/patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch +++ b/debian/patches/patchset-zen/sauce/0005-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch @@ -1,4 +1,4 @@ -From 2f3e9fbc48151e4499f9cbd810d9467ac34b0a3b Mon Sep 17 00:00:00 2001 +From 2b801ae725ae05be994d374efdce8fc2e828687f Mon Sep 17 00:00:00 2001 From: Kenny Levinsen <kl@kl.wtf> Date: Sun, 27 Dec 2020 14:43:13 +0000 Subject: ZEN: Input: evdev - use call_rcu when detaching client diff --git a/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch b/debian/patches/patchset-zen/sauce/0006-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch similarity index 94% rename from debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch rename to debian/patches/patchset-zen/sauce/0006-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch index a5a3c91..161728e 100644 --- a/debian/patches/patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch +++ b/debian/patches/patchset-zen/sauce/0006-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch @@ -1,4 +1,4 @@ -From 51026b78d015797e216aadc4e80158181c2c2bb4 Mon Sep 17 00:00:00 2001 +From 3777b5340ebf0460e6fb79205b294dd4333c9d8b Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Mon, 11 Jul 2022 19:10:30 -0500 Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State diff --git a/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch b/debian/patches/patchset-zen/sauce/0007-ZEN-intel-pstate-Implement-enable-parameter.patch similarity index 93% rename from debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch rename to debian/patches/patchset-zen/sauce/0007-ZEN-intel-pstate-Implement-enable-parameter.patch index 6750826..1429949 100644 --- a/debian/patches/patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch +++ b/debian/patches/patchset-zen/sauce/0007-ZEN-intel-pstate-Implement-enable-parameter.patch @@ -1,4 +1,4 @@ -From 48c8812a4cea0190a037757589443f3103c610ba Mon Sep 17 00:00:00 2001 +From d00df0f150c9d04cd229d42e0af906db3dfb5190 Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Wed, 15 Jan 2020 20:43:56 -0600 Subject: ZEN: intel-pstate: Implement "enable" parameter @@ -30,7 +30,7 @@ selection. --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -2254,6 +2254,9 @@ +@@ -2283,6 +2283,9 @@ disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -42,7 +42,7 @@ selection. 
governors layer of cpufreq and provides it own --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c -@@ -3817,6 +3817,8 @@ static int __init intel_pstate_setup(cha +@@ -3827,6 +3827,8 @@ static int __init intel_pstate_setup(cha if (!strcmp(str, "disable")) no_load = 1; diff --git a/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch b/debian/patches/patchset-zen/sauce/0008-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch similarity index 87% rename from debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch rename to debian/patches/patchset-zen/sauce/0008-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch index eb56d51..1d1b6dd 100644 --- a/debian/patches/patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch +++ b/debian/patches/patchset-zen/sauce/0008-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch @@ -1,4 +1,4 @@ -From bbc56fdeaa2017d0bbed05e1e832e6d7e4bdd6e0 Mon Sep 17 00:00:00 2001 +From f03da22e562a7d65a97926a76f61daeef8a1eb0d Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Fri, 15 Mar 2024 12:36:51 -0500 Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with @@ -13,7 +13,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -@@ -164,6 +164,7 @@ struct amdgpu_watchdog_timer { +@@ -160,6 +160,7 @@ struct amdgpu_watchdog_timer { */ extern int amdgpu_modeset; extern unsigned int amdgpu_vram_limit; @@ -23,7 +23,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with extern int amdgpu_gtt_size; --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -@@ -138,6 +138,7 @@ enum AMDGPU_DEBUG_MASK { +@@ -139,6 +139,7 @@ enum AMDGPU_DEBUG_MASK { }; unsigned int amdgpu_vram_limit = UINT_MAX; @@ -31,7 +31,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with int amdgpu_vis_vram_limit; int amdgpu_gart_size = -1; /* auto */ int amdgpu_gtt_size = -1; /* auto */ -@@ -262,6 +263,15 @@ struct amdgpu_watchdog_timer amdgpu_watc +@@ -258,6 +259,15 @@ struct amdgpu_watchdog_timer amdgpu_watc }; /** @@ -49,7 +49,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with */ --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c -@@ -3276,6 +3276,9 @@ static ssize_t amdgpu_hwmon_show_power_c +@@ -3180,6 +3180,9 @@ static ssize_t amdgpu_hwmon_show_power_c struct device_attribute *attr, char *buf) { @@ -61,7 +61,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -2793,7 +2793,10 @@ int smu_get_power_limit(void *handle, +@@ -2823,7 +2823,10 @@ int smu_get_power_limit(void *handle, *limit = smu->max_power_limit; break; case SMU_PPT_LIMIT_MIN: @@ -73,7 +73,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with break; default: return -EINVAL; -@@ -2817,7 +2820,14 @@ static int smu_set_power_limit(void *han +@@ -2847,7 +2850,14 @@ static int smu_set_power_limit(void *han if (smu->ppt_funcs->set_power_limit) return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); diff --git a/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch 
b/debian/patches/patchset-zen/sauce/0009-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch similarity index 91% rename from debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch rename to debian/patches/patchset-zen/sauce/0009-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch index 77b5a39..9abdd67 100644 --- a/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch +++ b/debian/patches/patchset-zen/sauce/0009-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch @@ -1,4 +1,4 @@ -From 2cceda3c699f19f9c2f287614db2fe5dd009f73a Mon Sep 17 00:00:00 2001 +From 5f93b67c4e2fa81be5cee3edd8ec056407d25f26 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf <sultan@kerneltoast.com> Date: Sun, 19 Apr 2020 19:59:18 -0700 Subject: ZEN: mm: Stop kswapd early when nothing's waiting for it to free @@ -43,14 +43,14 @@ Contains: --- a/mm/internal.h +++ b/mm/internal.h -@@ -739,6 +739,7 @@ extern void post_alloc_hook(struct page +@@ -741,6 +741,7 @@ void post_alloc_hook(struct page *page, extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; - void free_unref_page(struct page *page, unsigned int order); - void free_unref_folios(struct folio_batch *fbatch); + struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, + nodemask_t *); --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,8 @@ typedef int __bitwise fpi_t; @@ -102,7 +102,7 @@ Contains: --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -6385,7 +6385,7 @@ retry: +@@ -6382,7 +6382,7 @@ retry: return 0; } @@ -111,7 +111,7 @@ Contains: { struct zone *zone; unsigned long pfmemalloc_reserve = 0; -@@ -6414,6 +6414,10 @@ static bool allow_direct_reclaim(pg_data +@@ -6411,6 +6411,10 @@ static bool allow_direct_reclaim(pg_data wmark_ok = free_pages > pfmemalloc_reserve / 2; @@ -122,7 +122,7 @@ Contains: /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) -@@ -6479,7 +6483,7 @@ static bool throttle_direct_reclaim(gfp_ +@@ -6476,7 +6480,7 @@ static bool throttle_direct_reclaim(gfp_ /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; @@ -131,7 +131,7 @@ Contains: goto out; break; } -@@ -6501,11 +6505,14 @@ static bool throttle_direct_reclaim(gfp_ +@@ -6498,11 +6502,14 @@ static bool throttle_direct_reclaim(gfp_ */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, @@ -148,7 +148,7 @@ Contains: if (fatal_signal_pending(current)) return true; -@@ -7008,14 +7015,14 @@ restart: +@@ -7005,14 +7012,14 @@ restart: * able to safely make forward progress. 
Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && diff --git a/debian/patches/patchset-zen/sauce/0023-ZEN-ahci-Disable-staggered-spinup-by-default.patch b/debian/patches/patchset-zen/sauce/0010-ZEN-ahci-Disable-staggered-spinup-by-default.patch similarity index 93% rename from debian/patches/patchset-zen/sauce/0023-ZEN-ahci-Disable-staggered-spinup-by-default.patch rename to debian/patches/patchset-zen/sauce/0010-ZEN-ahci-Disable-staggered-spinup-by-default.patch index b66b8ca..815a8c4 100644 --- a/debian/patches/patchset-zen/sauce/0023-ZEN-ahci-Disable-staggered-spinup-by-default.patch +++ b/debian/patches/patchset-zen/sauce/0010-ZEN-ahci-Disable-staggered-spinup-by-default.patch @@ -1,4 +1,4 @@ -From 1ec451a4bbac7cc00b59f8ca504d6a8898615880 Mon Sep 17 00:00:00 2001 +From 80b06f0f0bba019632e40c11231987a7e996c340 Mon Sep 17 00:00:00 2001 From: EXtremeExploit <pedro.montes.alcalde@gmail.com> Date: Fri, 29 Nov 2024 13:05:27 -0300 Subject: ZEN: ahci: Disable staggered spinup by default diff --git a/debian/patches/patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch b/debian/patches/patchset-zen/sauce/0011-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch similarity index 74% rename from debian/patches/patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch rename to debian/patches/patchset-zen/sauce/0011-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch index b8492a3..4d6533f 100644 --- a/debian/patches/patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch +++ b/debian/patches/patchset-zen/sauce/0011-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch @@ -1,4 +1,4 @@ -From a31b09c511dd58e5032a3c941638207281b20ce4 Mon Sep 17 00:00:00 2001 +From ac35b7af0aac6a9eb996962130a99c9af75c8b08 Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Sat, 14 Dec 2024 11:23:18 -0600 Subject: ZEN: kernel/Kconfig.preempt: Remove EXPERT conditional on PREEMPT_RT @@ -11,12 +11,12 @@ items hidden by enabling CONFIG_EXPERT. 
--- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt -@@ -69,7 +69,7 @@ config PREEMPT +@@ -88,7 +88,7 @@ endchoice config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" -- depends on EXPERT && ARCH_SUPPORTS_RT -+ depends on ARCH_SUPPORTS_RT +- depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST ++ depends on ARCH_SUPPORTS_RT && !COMPILE_TEST select PREEMPTION help This option turns the kernel into a real-time kernel by replacing diff --git a/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Base-config-item.patch similarity index 80% rename from debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch rename to debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Base-config-item.patch index fd3c270..e289754 100644 --- a/debian/patches/patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch +++ b/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Base-config-item.patch @@ -1,4 +1,4 @@ -From 530ee9b20cf436bcbb3a632cb19fb5e13a29dde7 Mon Sep 17 00:00:00 2001 +From 8bf253ea1b48fe101dc0161824b9a7d85f420b84 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com> Date: Mon, 27 Jan 2020 18:10:06 +0100 Subject: ZEN: INTERACTIVE: Base config item @@ -9,7 +9,7 @@ Subject: ZEN: INTERACTIVE: Base config item --- a/init/Kconfig +++ b/init/Kconfig -@@ -154,6 +154,12 @@ config THREAD_INFO_IN_TASK +@@ -157,6 +157,12 @@ config THREAD_INFO_IN_TASK menu "General setup" diff --git a/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch similarity index 82% rename from debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch rename to debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch index f058d3e..ebd6eb7 100644 --- a/debian/patches/patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch +++ b/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch @@ -1,4 +1,4 @@ -From d2f0a5801471b5f67344b2c92a2aa29f1aed626a Mon Sep 17 00:00:00 2001 +From d3b2ab943a1de0838c4bd515dbed45f8f1c3c2cc Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com> Date: Mon, 27 Jan 2020 18:11:05 +0100 Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices @@ -10,7 +10,7 @@ Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices --- a/block/elevator.c +++ b/block/elevator.c -@@ -568,7 +568,11 @@ static struct elevator_type *elevator_ge +@@ -560,7 +560,11 @@ static struct elevator_type *elevator_ge !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; @@ -24,7 +24,7 @@ Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices /* --- a/init/Kconfig +++ b/init/Kconfig -@@ -160,6 +160,10 @@ config ZEN_INTERACTIVE +@@ -163,6 +163,10 @@ config ZEN_INTERACTIVE help Tunes the kernel for responsiveness at the cost of throughput and power usage. 
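
Together with the Kyber patch that follows, the elevator hunks amount to the scheduler defaults advertised in the Kconfig help text. A combined sketch, assuming the partly elided guards; zen_default_elevator() is an illustrative name:

	static struct elevator_type *zen_default_elevator(struct request_queue *q)
	{
		if (q->nr_hw_queues != 1 &&
		    !blk_mq_is_shared_tags(q->tag_set->flags)) {
	#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_MQ_IOSCHED_KYBER)
			return elevator_find_get("kyber");	/* MQ: none -> kyber */
	#else
			return NULL;				/* MQ: stay schedulerless */
	#endif
		}
	#if defined(CONFIG_ZEN_INTERACTIVE) && defined(CONFIG_IOSCHED_BFQ)
		return elevator_find_get("bfq");		/* SQ: mq-deadline -> bfq */
	#else
		return elevator_find_get("mq-deadline");
	#endif
	}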
diff --git a/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch similarity index 85% rename from debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch rename to debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch index 1c14af5..72c5442 100644 --- a/debian/patches/patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch +++ b/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch @@ -1,4 +1,4 @@ -From 346251fa257245b3a06e37de863a1dbafbf2bbc2 Mon Sep 17 00:00:00 2001 +From d941bedf16b95646be26364f00cf46c6649608a6 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> Date: Mon, 12 Dec 2022 00:03:03 +0100 Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices @@ -10,7 +10,7 @@ Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices --- a/block/elevator.c +++ b/block/elevator.c -@@ -566,7 +566,13 @@ static struct elevator_type *elevator_ge +@@ -558,7 +558,13 @@ static struct elevator_type *elevator_ge if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) @@ -26,7 +26,7 @@ Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices return elevator_find_get("bfq"); --- a/init/Kconfig +++ b/init/Kconfig -@@ -163,6 +163,7 @@ config ZEN_INTERACTIVE +@@ -166,6 +166,7 @@ config ZEN_INTERACTIVE --- Block Layer ---------------------------------------- Default scheduler for SQ..: mq-deadline -> bfq diff --git a/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch b/debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch similarity index 92% rename from debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch rename to debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch index 6cbef40..923ca1f 100644 --- a/debian/patches/patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch +++ b/debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch @@ -1,4 +1,4 @@ -From 26fcaf58616b8cb3ce042e31c640594ea2fb5987 Mon Sep 17 00:00:00 2001 +From d0ce01e1def080e52770f9a899476bb840807b37 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com> Date: Mon, 27 Jan 2020 18:21:09 +0100 Subject: ZEN: INTERACTIVE: Enable background reclaim of hugepages @@ -32,7 +32,7 @@ Reasoning and details in the original patch: https://lwn.net/Articles/711248/ --- a/init/Kconfig +++ b/init/Kconfig -@@ -165,6 +165,10 @@ config ZEN_INTERACTIVE +@@ -168,6 +168,10 @@ config ZEN_INTERACTIVE Default scheduler for SQ..: mq-deadline -> bfq Default scheduler for MQ..: none -> kyber @@ -45,7 +45,7 @@ Reasoning and details in the original patch: https://lwn.net/Articles/711248/ --- a/mm/huge_memory.c +++ b/mm/huge_memory.c -@@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags +@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif diff --git a/debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch 
b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch similarity index 91% rename from debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch rename to debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch index 52bb247..9708a96 100644 --- a/debian/patches/patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch +++ b/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch @@ -1,4 +1,4 @@ -From 9e5b04df7190ab4750ae3c67714fd537ef4d79f5 Mon Sep 17 00:00:00 2001 +From f1fd33efd4b70519ff51b78c62d6fdf7d4f69620 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> Date: Tue, 31 Oct 2023 19:03:10 +0100 Subject: ZEN: INTERACTIVE: Tune EEVDF for interactivity @@ -42,7 +42,7 @@ caused by rebalancing too many tasks at once. --- a/init/Kconfig +++ b/init/Kconfig -@@ -169,6 +169,13 @@ config ZEN_INTERACTIVE +@@ -172,6 +172,13 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes @@ -58,7 +58,7 @@ caused by rebalancing too many tasks at once. --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scalin +@@ -76,10 +76,19 @@ unsigned int sysctl_sched_tunable_scalin * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ @@ -78,7 +78,7 @@ caused by rebalancing too many tasks at once. static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cp +@@ -124,8 +133,12 @@ int __weak arch_asym_cpu_priority(int cp * * (default: 5 msec, units: microseconds) */ @@ -93,7 +93,7 @@ caused by rebalancing too many tasks at once. /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2797,7 +2797,7 @@ extern void deactivate_task(struct rq *r +@@ -2837,7 +2837,7 @@ extern void deactivate_task(struct rq *r extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); diff --git a/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch similarity index 97% rename from debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch rename to debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch index f623dc9..aa1c0c6 100644 --- a/debian/patches/patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch +++ b/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch @@ -1,4 +1,4 @@ -From f654ea11471f81ac7dd68467f552db25722df25e Mon Sep 17 00:00:00 2001 +From 75f2a8831bd24a35d9853b11dabc06a138c5e445 Mon Sep 17 00:00:00 2001 From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com> Date: Mon, 27 Jan 2020 18:27:16 +0100 Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity @@ -75,7 +75,7 @@ Remove MuQSS cpufreq configuration. 
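
The EEVDF tuning above follows one pattern: compile-time defaults that lean toward latency when CONFIG_ZEN_INTERACTIVE is set. A minimal sketch using the bandwidth-slice change documented in the Kconfig help (5 -> 3 ms); the hunk itself elides these lines:

	#ifdef CONFIG_ZEN_INTERACTIVE
	static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;	/* 3 ms */
	#else
	static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;	/* 5 ms, upstream default */
	#endif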
--- a/init/Kconfig +++ b/init/Kconfig -@@ -176,6 +176,12 @@ config ZEN_INTERACTIVE +@@ -179,6 +179,12 @@ config ZEN_INTERACTIVE Bandwidth slice size...........: 5 -> 3 ms Task rebalancing threshold.....: 32 -> 8 diff --git a/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch similarity index 85% rename from debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch rename to debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch index fa5f14a..12419dd 100644 --- a/debian/patches/patchset-zen/sauce/0017-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch +++ b/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch @@ -1,4 +1,4 @@ -From f138e9762fd03612db5593f4c267c8f8b5799159 Mon Sep 17 00:00:00 2001 +From b82d80a4195f179b9c0d0c80f662a7f42ed21ce8 Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Sat, 5 Mar 2022 11:37:14 -0600 Subject: ZEN: INTERACTIVE: mm: Disable unevictable compaction @@ -12,7 +12,7 @@ turn it off when CONFIG_ZEN_INTERACTIVE is set as well. --- a/init/Kconfig +++ b/init/Kconfig -@@ -168,6 +168,7 @@ config ZEN_INTERACTIVE +@@ -171,6 +171,7 @@ config ZEN_INTERACTIVE --- Virtual Memory Subsystem --------------------------- Background-reclaim hugepages...: no -> yes @@ -22,7 +22,7 @@ turn it off when CONFIG_ZEN_INTERACTIVE is set as well. --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -648,7 +648,7 @@ config COMPACTION +@@ -691,7 +691,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION diff --git a/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch similarity index 95% rename from debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch rename to debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch index 35669b3..ea480ea 100644 --- a/debian/patches/patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch +++ b/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch @@ -1,4 +1,4 @@ -From 76960c3806e7dfb618f49677cc84dafbfe48e4c4 Mon Sep 17 00:00:00 2001 +From 7227af3e01f9ae5a2bcdc9aa652c973438938eb3 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf <sultan@kerneltoast.com> Date: Sat, 28 Mar 2020 13:06:28 -0700 Subject: ZEN: INTERACTIVE: mm: Disable watermark boosting by default @@ -33,7 +33,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com> --- a/init/Kconfig +++ b/init/Kconfig -@@ -169,6 +169,7 @@ config ZEN_INTERACTIVE +@@ -172,6 +172,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes Compact unevictable............: yes -> no diff --git a/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch similarity index 95% rename from debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch rename to debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch index 1751479..db4d266 100644 --- 
a/debian/patches/patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch +++ b/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch @@ -1,4 +1,4 @@ -From fc3e794cecb686d4e05c6ed86fdf9b2dbd725ea9 Mon Sep 17 00:00:00 2001 +From 91187cefc66b9c186a78d7bd996088fc74c66c99 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf <sultan@kerneltoast.com> Date: Wed, 20 Oct 2021 20:50:11 -0700 Subject: ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to @@ -47,7 +47,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com> --- a/init/Kconfig +++ b/init/Kconfig -@@ -170,6 +170,7 @@ config ZEN_INTERACTIVE +@@ -173,6 +173,7 @@ config ZEN_INTERACTIVE Background-reclaim hugepages...: no -> yes Compact unevictable............: yes -> no Watermark boost factor.........: 1.5 -> 0 diff --git a/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch similarity index 88% rename from debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch rename to debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch index 9f0b5dc..5f77548 100644 --- a/debian/patches/patchset-zen/sauce/0020-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch +++ b/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch @@ -1,4 +1,4 @@ -From be57a2710aef65116767d26930dd1251ff6e060f Mon Sep 17 00:00:00 2001 +From 779648709dc797dac595e3007b4c7c3fee254537 Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Sat, 21 May 2022 15:15:09 -0500 Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops @@ -20,7 +20,7 @@ Fixes: https://github.com/zen-kernel/zen-kernel/issues/282 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c -@@ -3308,6 +3308,11 @@ static int crypt_ctr(struct dm_target *t +@@ -3305,6 +3305,11 @@ static int crypt_ctr(struct dm_target *t goto bad; } @@ -34,7 +34,7 @@ Fixes: https://github.com/zen-kernel/zen-kernel/issues/282 goto bad; --- a/init/Kconfig +++ b/init/Kconfig -@@ -164,6 +164,7 @@ config ZEN_INTERACTIVE +@@ -167,6 +167,7 @@ config ZEN_INTERACTIVE Default scheduler for SQ..: mq-deadline -> bfq Default scheduler for MQ..: none -> kyber diff --git a/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch similarity index 86% rename from debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch rename to debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch index 0a5bab6..47ac21c 100644 --- a/debian/patches/patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch +++ b/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch @@ -1,4 +1,4 @@ -From 41fe25c2e4e89c6afd35e3feb720e5a6797857d3 Mon Sep 17 00:00:00 2001 +From ef87b1cb12134c34eed834315b03c4a6747b5716 Mon Sep 17 00:00:00 2001 From: Steven Barrett <steven@liquorix.net> Date: Mon, 5 Sep 2022 11:35:20 -0500 Subject: ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead @@ -20,7 +20,7 @@ same change so Zen Kernel users benefit. 
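
The mm/swap.c hunk below adds a few lines at the top of swap_setup(). Given the patch title and the readahead "3 -> 0" note in the Kconfig help, their likely shape is a sketch along these lines (page_cluster backs the vm.page-cluster sysctl; 0 disables swap-in readahead outright):

	void __init swap_setup(void)
	{
	#ifdef CONFIG_ZEN_INTERACTIVE
		/* Swap in only the faulting page; skip readahead from swap. */
		page_cluster = 0;
		return;
	#endif
		/* ... upstream sizing based on totalram_pages() continues here ... */
	}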
 --- a/init/Kconfig
 +++ b/init/Kconfig
-@@ -172,6 +172,7 @@ config ZEN_INTERACTIVE
+@@ -175,6 +175,7 @@ config ZEN_INTERACTIVE
 	  Compact unevictable............: yes -> no
 	  Watermark boost factor.........: 1.5 -> 0
 	  Pageblock order................: 10 -> 3
@@ -30,7 +30,7 @@ same change so Zen Kernel users benefit.
 --- a/mm/swap.c
 +++ b/mm/swap.c
-@@ -1080,6 +1080,10 @@ void folio_batch_remove_exceptionals(str
+@@ -1081,6 +1081,10 @@ void folio_batch_remove_exceptionals(str
  */
 void __init swap_setup(void)
 {
@@ -41,7 +41,7 @@ same change so Zen Kernel users benefit.
 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
-@@ -1091,4 +1095,5 @@ void __init swap_setup(void)
+@@ -1092,4 +1096,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
diff --git a/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
similarity index 85%
rename from debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
rename to debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
index 5ac963c..bc9a752 100644
--- a/debian/patches/patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
+++ b/debian/patches/patchset-zen/sauce/0023-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
@@ -1,4 +1,4 @@
-From 40de9c08129e2d8e182a166df2f1e823f70fa31d Mon Sep 17 00:00:00 2001
+From cb33a6dc022faa07ac1e1cd544567b28a7e9afeb Mon Sep 17 00:00:00 2001
 From: Steven Barrett <steven@liquorix.net>
 Date: Sun, 19 Sep 2021 16:03:36 -0500
 Subject: ZEN: INTERACTIVE: Document PDS/BMQ configuration
@@ -9,7 +9,7 @@ Subject: ZEN: INTERACTIVE: Document PDS/BMQ configuration
 --- a/init/Kconfig
 +++ b/init/Kconfig
-@@ -181,6 +181,11 @@ config ZEN_INTERACTIVE
+@@ -184,6 +184,11 @@ config ZEN_INTERACTIVE
 	  Bandwidth slice size...........: 5 -> 3 ms
 	  Task rebalancing threshold.....: 32 -> 8
diff --git a/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch b/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
deleted file mode 100644
index b7bf94e..0000000
--- a/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
+++ /dev/null
@@ -1,194 +0,0 @@
-From eacae6d88bcc8a925124f97b7788bb2bfac8b267 Mon Sep 17 00:00:00 2001
-From: Vinay Banakar <vny@google.com>
-Date: Mon, 20 Jan 2025 16:47:29 -0600
-Subject: mm: Optimize TLB flushes during page reclaim
-
-The current implementation in shrink_folio_list() performs full TLB
-flushes and issues IPIs for each individual page being reclaimed. This
-causes unnecessary overhead during memory reclaim, whether triggered
-by madvise(MADV_PAGEOUT) or kswapd, especially in scenarios where
-applications are actively moving cold pages to swap while maintaining
-high performance requirements for hot pages.
-
-The current code:
-1. Clears PTE and unmaps each page individually
-2. Performs a full TLB flush on all cores using the VMA (via CR3 write) or
-issues individual TLB shootdowns (invlpg+invlpcid) for single-core usage
-3. Submits each page individually to BIO
-
-This approach results in:
-- Excessive full TLB flushes across all cores
-- Unnecessary IPI storms when processing multiple pages
-- Suboptimal I/O submission patterns
-
-I initially tried using selective TLB shootdowns (invlpg) instead of
-full TLB flushes per each page to avoid interference with other
-threads. However, this approach still required sending IPIs to all
-cores for each page, which did not significantly improve application
-throughput.
-
-This patch instead optimizes the process by batching operations,
-issuing one IPI per PMD instead of per page. This reduces interrupts
-by a factor of 512 and enables batching page submissions to BIO. The
-new approach:
-1. Collect dirty pages that need to be written back
-2. Issue a single TLB flush for all dirty pages in the batch
-3. Process the collected pages for writebacks (submit to BIO)
-
-Testing shows significant reduction in application throughput impact
-during page-out operations. Applications maintain better performance
-during memory reclaim, when triggered by explicit
-madvise(MADV_PAGEOUT) calls.
-
-I'd appreciate your feedback on this approach, especially on the
-correctness of batched BIO submissions. Looking forward to your
-comments.
-
-Signed-off-by: Vinay Banakar <vny@google.com>
----
- mm/vmscan.c | 120 ++++++++++++++++++++++++++++++++--------------------
- 1 file changed, 74 insertions(+), 46 deletions(-)
-
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -1053,6 +1053,7 @@ static unsigned int shrink_folio_list(st
- 	struct folio_batch free_folios;
- 	LIST_HEAD(ret_folios);
- 	LIST_HEAD(demote_folios);
-+	LIST_HEAD(pageout_list);
- 	unsigned int nr_reclaimed = 0, nr_demoted = 0;
- 	unsigned int pgactivate = 0;
- 	bool do_demote_pass;
-@@ -1365,52 +1366,9 @@ retry:
- 			if (!sc->may_writepage)
- 				goto keep_locked;
- 
--			/*
--			 * Folio is dirty. Flush the TLB if a writable entry
--			 * potentially exists to avoid CPU writes after I/O
--			 * starts and then write it out here.
--			 */
--			try_to_unmap_flush_dirty();
--			switch (pageout(folio, mapping, &plug, folio_list)) {
--			case PAGE_KEEP:
--				goto keep_locked;
--			case PAGE_ACTIVATE:
--				/*
--				 * If shmem folio is split when writeback to swap,
--				 * the tail pages will make their own pass through
--				 * this function and be accounted then.
--				 */
--				if (nr_pages > 1 && !folio_test_large(folio)) {
--					sc->nr_scanned -= (nr_pages - 1);
--					nr_pages = 1;
--				}
--				goto activate_locked;
--			case PAGE_SUCCESS:
--				if (nr_pages > 1 && !folio_test_large(folio)) {
--					sc->nr_scanned -= (nr_pages - 1);
--					nr_pages = 1;
--				}
--				stat->nr_pageout += nr_pages;
--
--				if (folio_test_writeback(folio))
--					goto keep;
--				if (folio_test_dirty(folio))
--					goto keep;
--
--				/*
--				 * A synchronous write - probably a ramdisk. Go
--				 * ahead and try to reclaim the folio.
--				 */
--				if (!folio_trylock(folio))
--					goto keep;
--				if (folio_test_dirty(folio) ||
--				    folio_test_writeback(folio))
--					goto keep_locked;
--				mapping = folio_mapping(folio);
--				fallthrough;
--			case PAGE_CLEAN:
--				; /* try to free the folio below */
--			}
-+			/* Add to pageout list for defered bio submissions */
-+			list_add(&folio->lru, &pageout_list);
-+			continue;
- 		}
- 
- 		/*
-@@ -1521,6 +1479,76 @@ keep:
- 	}
- 	/* 'folio_list' is always empty here */
- 
-+	if (!list_empty(&pageout_list)) {
-+		/*
-+		 * Batch TLB flushes by flushing once before processing all dirty pages.
-+		 * Since we operate on one PMD at a time, this batches TLB flushes at
-+		 * PMD granularity rather than per-page, reducing IPIs.
-+		 */
-+		struct address_space *mapping;
-+		try_to_unmap_flush_dirty();
-+
-+		while (!list_empty(&pageout_list)) {
-+			struct folio *folio = lru_to_folio(&pageout_list);
-+			list_del(&folio->lru);
-+
-+			/* Recheck if page got reactivated */
-+			if (folio_test_active(folio) ||
-+			    (folio_mapped(folio) && folio_test_young(folio)))
-+				goto skip_pageout_locked;
-+
-+			mapping = folio_mapping(folio);
-+			pageout_t pageout_res = pageout(folio, mapping, &plug, &pageout_list);
-+			switch (pageout_res) {
-+			case PAGE_KEEP:
-+				goto skip_pageout_locked;
-+			case PAGE_ACTIVATE:
-+				goto skip_pageout_locked;
-+			case PAGE_SUCCESS:
-+				stat->nr_pageout += folio_nr_pages(folio);
-+
-+				if (folio_test_writeback(folio) ||
-+				    folio_test_dirty(folio))
-+					goto skip_pageout;
-+
-+				/*
-+				 * A synchronous write - probably a ramdisk. Go
-+				 * ahead and try to reclaim the folio.
-+				 */
-+				if (!folio_trylock(folio))
-+					goto skip_pageout;
-+				if (folio_test_dirty(folio) ||
-+				    folio_test_writeback(folio))
-+					goto skip_pageout_locked;
-+
-+				// Try to free the page
-+				if (!mapping ||
-+				    !__remove_mapping(mapping, folio, true,
-+						      sc->target_mem_cgroup))
-+					goto skip_pageout_locked;
-+
-+				nr_reclaimed += folio_nr_pages(folio);
-+				folio_unlock(folio);
-+				continue;
-+
-+			case PAGE_CLEAN:
-+				if (!mapping ||
-+				    !__remove_mapping(mapping, folio, true,
-+						      sc->target_mem_cgroup))
-+					goto skip_pageout_locked;
-+
-+				nr_reclaimed += folio_nr_pages(folio);
-+				folio_unlock(folio);
-+				continue;
-+			}
-+
-+skip_pageout_locked:
-+			folio_unlock(folio);
-+skip_pageout:
-+			list_add(&folio->lru, &ret_folios);
-+		}
-+	}
-+
- 	/* Migrate folios selected for demotion */
- 	nr_demoted = demote_folio_list(&demote_folios, pgdat);
- 	nr_reclaimed += nr_demoted;
diff --git a/debian/patches/series b/debian/patches/series
index 979c882..8cfaa85 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -69,16 +69,10 @@ features/x86/x86-make-x32-syscall-support-conditional.patch
 bugfix/all/disable-some-marvell-phys.patch
 bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch
 bugfix/all/documentation-use-relative-source-paths-in-abi-documentation.patch
-bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch
-bugfix/all/drm-amdkfd-Fix-user-queue-validation-on-Gfx7-8.patch
 
 # Miscellaneous features
 
 # Lockdown missing pieces
-features/all/lockdown/efi-add-an-efi_secure_boot-flag-to-indicate-secure-b.patch
-features/all/lockdown/efi-lock-down-the-kernel-if-booted-in-secure-boot-mo.patch
-features/all/lockdown/mtd-disable-slram-and-phram-when-locked-down.patch
-features/all/lockdown/arm64-add-kernel-config-option-to-lock-down-when.patch
 
 # Improve integrity platform keyring for kernel modules verification
 features/all/db-mok-keyring/0003-MODSIGN-checking-the-blacklisted-hash-before-loading-a-kernel-module.patch
@@ -94,7 +88,6 @@ bugfix/all/module-disable-matching-missing-version-crc.patch
 bugfix/all/usbip-document-tcp-wrappers.patch
 bugfix/all/kbuild-fix-recordmcount-dependency.patch
 bugfix/all/tools-perf-remove-shebangs.patch
-bugfix/x86/revert-perf-build-fix-libunwind-feature-detection-on.patch
 bugfix/all/tools-build-remove-bpf-run-time-check-at-build-time.patch
 bugfix/all/cpupower-fix-checks-for-cpu-existence.patch
 bugfix/all/libapi-define-_fortify_source-as-2-not-empty.patch
@@ -123,76 +116,79 @@ mixed-arch/0003-krd-adjust-CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3.patch
 mixed-arch/0004-XANMOD-x86-build-Prevent-generating-avx2-and-avx512-.patch
 mixed-arch/0005-krd-adjust-KBUILD_CFLAGS-fno-tree-vectorize.patch
 mixed-arch/0006-XANMOD-kbuild-Add-GCC-SMS-based-modulo-scheduling-fl.patch
-mixed-arch/0007-PF-kbuild-6.12-adopt-proposed-upstream-change-for-gener.patch
-
-misc-bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch
-misc-bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch
-misc-bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch
-misc-bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch
-misc-bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch
-misc-bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch
-misc-bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch
-misc-bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch
-misc-bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch
-misc-bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch
-misc-bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch
-misc-bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch
-misc-bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch
-misc-bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch
-misc-bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch
-misc-bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch
-misc-bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch
-misc-bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch
-misc-bbr3/0019-x86-cfi-bpf-Add-tso_segs-and-skb_marked_lost-to-bpf_.patch
-
-misc-ntsync7/0001-ntsync-Return-the-fd-from-NTSYNC_IOC_CREATE_SEM.patch
-misc-ntsync7/0002-ntsync-Rename-NTSYNC_IOC_SEM_POST-to-NTSYNC_IOC_SEM_.patch
-misc-ntsync7/0003-ntsync-Introduce-NTSYNC_IOC_WAIT_ANY.patch
-misc-ntsync7/0004-ntsync-Introduce-NTSYNC_IOC_WAIT_ALL.patch
-misc-ntsync7/0005-ntsync-Introduce-NTSYNC_IOC_CREATE_MUTEX.patch
-misc-ntsync7/0006-ntsync-Introduce-NTSYNC_IOC_MUTEX_UNLOCK.patch
-misc-ntsync7/0007-ntsync-Introduce-NTSYNC_IOC_MUTEX_KILL.patch
-misc-ntsync7/0008-ntsync-Introduce-NTSYNC_IOC_CREATE_EVENT.patch
-misc-ntsync7/0009-ntsync-Introduce-NTSYNC_IOC_EVENT_SET.patch
-misc-ntsync7/0010-ntsync-Introduce-NTSYNC_IOC_EVENT_RESET.patch
-misc-ntsync7/0011-ntsync-Introduce-NTSYNC_IOC_EVENT_PULSE.patch
-misc-ntsync7/0012-ntsync-Introduce-NTSYNC_IOC_SEM_READ.patch
-misc-ntsync7/0013-ntsync-Introduce-NTSYNC_IOC_MUTEX_READ.patch
-misc-ntsync7/0014-ntsync-Introduce-NTSYNC_IOC_EVENT_READ.patch
-misc-ntsync7/0015-ntsync-Introduce-alertable-waits.patch
-misc-ntsync7/0016-maintainers-Add-an-entry-for-ntsync.patch
-misc-ntsync7/0017-docs-ntsync-Add-documentation-for-the-ntsync-uAPI.patch
-misc-ntsync7/0018-ntsync-No-longer-depend-on-BROKEN.patch
-misc-ntsync7/0019-ntsync-Set-the-permissions-to-be-0666.patch
 
 misc-openwrt/0001-mac80211-ignore-AP-power-level-when-tx-power-type-is.patch
 
-patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch
-patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch
+patchset-pf/amd-pstate/0001-cpufreq-amd-pstate-Modify-the-min_perf-calculation-i.patch
+patchset-pf/amd-pstate/0002-cpufreq-amd-pstate-Remove-the-redundant-des_perf-cla.patch
+patchset-pf/amd-pstate/0003-cpufreq-amd-pstate-Pass-min-max_limit_perf-as-min-ma.patch
+patchset-pf/amd-pstate/0004-cpufreq-amd-pstate-Convert-all-perf-values-to-u8.patch
+patchset-pf/amd-pstate/0005-cpufreq-amd-pstate-Modularize-perf-freq-conversion.patch
+patchset-pf/amd-pstate/0006-cpufreq-amd-pstate-Remove-the-unnecessary-cpufreq_up.patch
+patchset-pf/amd-pstate/0007-cpufreq-amd-pstate-Add-missing-NULL-ptr-check-in-amd.patch
+patchset-pf/amd-pstate/0008-cpufreq-amd-pstate-Use-scope-based-cleanup-for-cpufr.patch
+patchset-pf/amd-pstate/0009-cpufreq-amd-pstate-Remove-the-unncecessary-driver_lo.patch
+patchset-pf/amd-pstate/0010-cpufreq-amd-pstate-Fix-the-clamping-of-perf-values.patch
+patchset-pf/amd-pstate/0011-cpufreq-amd-pstate-Invalidate-cppc_req_cached-during.patch
+patchset-pf/amd-pstate/0012-cpufreq-amd-pstate-Show-a-warning-when-a-CPU-fails-t.patch
+patchset-pf/amd-pstate/0013-cpufreq-amd-pstate-Drop-min-and-max-cached-frequenci.patch
+patchset-pf/amd-pstate/0014-cpufreq-amd-pstate-Move-perf-values-into-a-union.patch
+patchset-pf/amd-pstate/0015-cpufreq-amd-pstate-Overhaul-locking.patch
+patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Drop-cppc_cap1_cached.patch
+patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-ut-Use-_free-macro-to-free-put-po.patch
+patchset-pf/amd-pstate/0018-cpufreq-amd-pstate-ut-Allow-lowest-nonlinear-and-low.patch
+patchset-pf/amd-pstate/0019-cpufreq-amd-pstate-ut-Drop-SUCCESS-and-FAIL-enums.patch
+patchset-pf/amd-pstate/0020-cpufreq-amd-pstate-ut-Run-on-all-of-the-correct-CPUs.patch
+patchset-pf/amd-pstate/0021-cpufreq-amd-pstate-ut-Adjust-variable-scope.patch
+patchset-pf/amd-pstate/0022-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch
+patchset-pf/amd-pstate/0023-cpufreq-amd-pstate-Cache-CPPC-request-in-shared-mem-.patch
+patchset-pf/amd-pstate/0024-cpufreq-amd-pstate-Move-all-EPP-tracing-into-_update.patch
+patchset-pf/amd-pstate/0025-cpufreq-amd-pstate-Update-cppc_req_cached-for-shared.patch
+patchset-pf/amd-pstate/0026-cpufreq-amd-pstate-Drop-debug-statements-for-policy-.patch
+patchset-pf/amd-pstate/0027-cpufreq-amd-pstate-Rework-CPPC-enabling.patch
+patchset-pf/amd-pstate/0028-cpufreq-amd-pstate-Stop-caching-EPP.patch
+patchset-pf/amd-pstate/0029-cpufreq-amd-pstate-Drop-actions-in-amd_pstate_epp_cp.patch
+patchset-pf/amd-pstate/0030-cpufreq-amd-pstate-fix-warning-noticed-by-kernel-tes.patch
 
-patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch
-patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch
-patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch
+patchset-pf/cpuidle/0001-cpuidle-Prefer-teo-over-menu-governor.patch
 
-patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch
+patchset-pf/crypto/0001-crypto-x86-aes-xts-make-the-fast-path-64-bit-specifi.patch
+patchset-pf/crypto/0002-crypto-x86-aes-ctr-rewrite-AESNI-AVX-optimized-CTR-a.patch
 
-patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch
-
-patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch
+patchset-pf/zstd/0001-zstd-import-upstream-v1.5.7.patch
 patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch
 
-patchset-xanmod/amd/0001-platform-x86-amd-amd_3d_vcache-Add-AMD-3D-V-Cache-op.patch
-patchset-xanmod/amd/0002-platform-x86-amd-amd_3d_vcache-Add-sysfs-ABI-documen.patch
-
 patchset-xanmod/binder/0001-binder-turn-into-module.patch
+patchset-xanmod/binder/0002-binder-turn-into-module-list_lru_add-list_lru_del.patch
+patchset-xanmod/binder/0003-binder-turn-into-module-lock_vma_under_rcu.patch
 
 patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch
 patchset-xanmod/clearlinux/0002-firmware-Enable-stateless-firmware-loading.patch
 patchset-xanmod/clearlinux/0003-locking-rwsem-spin-faster.patch
+patchset-xanmod/clearlinux/0004-drivers-initialize-ata-before-graphics.patch
 
 patchset-xanmod/net/netfilter/0001-netfilter-Add-netfilter-nf_tables-fullcone-support.patch
 patchset-xanmod/net/netfilter/0002-netfilter-add-xt_FLOWOFFLOAD-target.patch
 
+patchset-xanmod/net/tcp/bbr3/0001-net-tcp_bbr-broaden-app-limited-rate-sample-detectio.patch
+patchset-xanmod/net/tcp/bbr3/0002-net-tcp_bbr-v2-shrink-delivered_mstamp-first_tx_msta.patch
+patchset-xanmod/net/tcp/bbr3/0003-net-tcp_bbr-v2-snapshot-packets-in-flight-at-transmi.patch
+patchset-xanmod/net/tcp/bbr3/0004-net-tcp_bbr-v2-count-packets-lost-over-TCP-rate-samp.patch
+patchset-xanmod/net/tcp/bbr3/0005-net-tcp_bbr-v2-export-FLAG_ECE-in-rate_sample.is_ece.patch
+patchset-xanmod/net/tcp/bbr3/0006-net-tcp_bbr-v2-introduce-ca_ops-skb_marked_lost-CC-m.patch
+patchset-xanmod/net/tcp/bbr3/0007-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-merge-in.patch
+patchset-xanmod/net/tcp/bbr3/0008-net-tcp_bbr-v2-adjust-skb-tx.in_flight-upon-split-in.patch
+patchset-xanmod/net/tcp/bbr3/0009-net-tcp-add-new-ca-opts-flag-TCP_CONG_WANTS_CE_EVENT.patch
+patchset-xanmod/net/tcp/bbr3/0010-net-tcp-re-generalize-TSO-sizing-in-TCP-CC-module-AP.patch
+patchset-xanmod/net/tcp/bbr3/0011-net-tcp-add-fast_ack_mode-1-skip-rwin-check-in-tcp_f.patch
+patchset-xanmod/net/tcp/bbr3/0012-net-tcp_bbr-v2-record-app-limited-status-of-TLP-repa.patch
+patchset-xanmod/net/tcp/bbr3/0013-net-tcp_bbr-v2-inform-CC-module-of-losses-repaired-b.patch
+patchset-xanmod/net/tcp/bbr3/0014-net-tcp_bbr-v2-introduce-is_acking_tlp_retrans_seq-i.patch
+patchset-xanmod/net/tcp/bbr3/0015-tcp-introduce-per-route-feature-RTAX_FEATURE_ECN_LOW.patch
+patchset-xanmod/net/tcp/bbr3/0016-net-tcp_bbr-v3-update-TCP-bbr-congestion-control-mod.patch
+patchset-xanmod/net/tcp/bbr3/0017-net-tcp_bbr-v3-ensure-ECN-enabled-BBR-flows-set-ECT-.patch
+patchset-xanmod/net/tcp/bbr3/0018-tcp-export-TCPI_OPT_ECN_LOW-in-tcp_info-tcpi_options.patch
+
 patchset-xanmod/net/tcp/cloudflare/0001-tcp-Add-a-sysctl-to-skip-tcp-collapse-processing-whe.patch
 
 patchset-xanmod/pci_acso/0001-PCI-Enable-overrides-for-missing-ACS-capabilities.patch
@@ -204,67 +200,68 @@ patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch
 patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch
 patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch
 
-patchset-zen/invlpgb-v9/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
-patchset-zen/invlpgb-v9/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch
-patchset-zen/invlpgb-v9/0003-x86-mm-consolidate-full-flush-threshold-decision.patch
-patchset-zen/invlpgb-v9/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch
-patchset-zen/invlpgb-v9/0005-x86-mm-add-INVLPGB-support-code.patch
-patchset-zen/invlpgb-v9/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch
-patchset-zen/invlpgb-v9/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch
-patchset-zen/invlpgb-v9/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch
-patchset-zen/invlpgb-v9/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch
-patchset-zen/invlpgb-v9/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch
-patchset-zen/invlpgb-v9/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
-patchset-zen/invlpgb-v9/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
-patchset-zen/invlpgb-v9/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
-patchset-zen/invlpgb-v9/0014-vdso-Introduce-vdso-page.h.patch
-patchset-zen/invlpgb-v9/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch
+patchset-zen/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
+patchset-zen/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch
+patchset-zen/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch
+patchset-zen/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch
+patchset-zen/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch
+patchset-zen/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch
+patchset-zen/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch
+patchset-zen/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch
+patchset-zen/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch
+patchset-zen/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch
+patchset-zen/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
+patchset-zen/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch
+patchset-zen/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
+patchset-zen/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch
 
-patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
+patchset-zen/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch
 
-patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch
-patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch
-patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch
-patchset-xanmod/xanmod/0004-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch
-patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch
-patchset-xanmod/xanmod/0006-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch
-patchset-xanmod/xanmod/0007-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch
-patchset-xanmod/xanmod/0008-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch
-patchset-xanmod/xanmod/0009-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch
-patchset-xanmod/xanmod/0010-XANMOD-mm-Raise-max_map_count-default-value.patch
-patchset-xanmod/xanmod/0011-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch
-patchset-xanmod/xanmod/0012-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch
-patchset-xanmod/xanmod/0013-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch
-patchset-xanmod/xanmod/0014-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch
-patchset-xanmod/xanmod/0015-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch
-patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch
+patchset-xanmod/xanmod/0001-kbuild-Re-add-.config-file-required-to-sign-external.patch
+patchset-xanmod/xanmod/0002-kbuild-Remove-GCC-minimal-function-alignment.patch
+patchset-xanmod/xanmod/0003-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch
+patchset-xanmod/xanmod/0004-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch
+patchset-xanmod/xanmod/0005-XANMOD-block-mq-deadline-Increase-write-priority-to-.patch
+patchset-xanmod/xanmod/0006-XANMOD-block-mq-deadline-Disable-front_merges-by-def.patch
+patchset-xanmod/xanmod/0007-XANMOD-block-Set-rq_affinity-to-force-complete-I-O-r.patch
+patchset-xanmod/xanmod/0008-XANMOD-blk-wbt-Set-wbt_default_latency_nsec-to-2msec.patch
+patchset-xanmod/xanmod/0009-XANMOD-kconfig-add-500Hz-timer-interrupt-kernel-conf.patch
+patchset-xanmod/xanmod/0010-XANMOD-dcache-cache_pressure-50-decreases-the-rate-a.patch
+patchset-xanmod/xanmod/0011-XANMOD-mm-Raise-max_map_count-default-value.patch
+patchset-xanmod/xanmod/0012-XANMOD-mm-vmscan-Set-minimum-amount-of-swapping.patch
+patchset-xanmod/xanmod/0013-XANMOD-sched-autogroup-Add-kernel-parameter-and-conf.patch
+patchset-xanmod/xanmod/0014-XANMOD-cpufreq-tunes-ondemand-and-conservative-gover.patch
+patchset-xanmod/xanmod/0015-XANMOD-lib-kconfig.debug-disable-default-SYMBOLIC_ER.patch
+patchset-xanmod/xanmod/0016-XANMOD-scripts-setlocalversion-remove-tag-for-git-re.patch
+patchset-xanmod/xanmod/0017-XANMOD-scripts-setlocalversion-Move-localversion-fil.patch
 
 patchset-zen/sauce/0001-ZEN-Add-VHBA-driver.patch
-patchset-zen/sauce/0002-vhba-Fix-compat-with-kernel-6.11.patch
+patchset-zen/sauce/0002-VHBA-fix-building-with-kernel-6.14-rc1.patch
 patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch
 patchset-zen/sauce/0004-ZEN-Disable-stack-conservation-for-GCC.patch
-patchset-zen/sauce/0005-ZEN-Initialize-ata-before-graphics.patch
-patchset-zen/sauce/0006-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch
-patchset-zen/sauce/0007-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch
-patchset-zen/sauce/0008-ZEN-intel-pstate-Implement-enable-parameter.patch
-patchset-zen/sauce/0009-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch
-patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch
-patchset-zen/sauce/0011-ZEN-INTERACTIVE-Base-config-item.patch
-patchset-zen/sauce/0012-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch
-patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch
-patchset-zen/sauce/0014-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch
-patchset-zen/sauce/0015-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch
-patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch
-patchset-zen/sauce/0017-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch
-patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch
-patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch
-patchset-zen/sauce/0020-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch
-patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch
-patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
-patchset-zen/sauce/0023-ZEN-ahci-Disable-staggered-spinup-by-default.patch
-patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch
+patchset-zen/sauce/0005-ZEN-Input-evdev-use-call_rcu-when-detaching-client.patch
+patchset-zen/sauce/0006-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch
+patchset-zen/sauce/0007-ZEN-intel-pstate-Implement-enable-parameter.patch
+patchset-zen/sauce/0008-ZEN-drm-amdgpu-pm-Allow-override-of-min_power_limit-.patch
+patchset-zen/sauce/0009-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch
+patchset-zen/sauce/0010-ZEN-ahci-Disable-staggered-spinup-by-default.patch
+patchset-zen/sauce/0011-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.patch
+patchset-zen/sauce/0012-ZEN-INTERACTIVE-Base-config-item.patch
+patchset-zen/sauce/0013-ZEN-INTERACTIVE-Use-BFQ-as-the-elevator-for-SQ-devic.patch
+patchset-zen/sauce/0014-ZEN-INTERACTIVE-Use-Kyber-as-the-elevator-for-MQ-dev.patch
+patchset-zen/sauce/0015-ZEN-INTERACTIVE-Enable-background-reclaim-of-hugepag.patch
+patchset-zen/sauce/0016-ZEN-INTERACTIVE-Tune-EEVDF-for-interactivity.patch
+patchset-zen/sauce/0017-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch
+patchset-zen/sauce/0018-ZEN-INTERACTIVE-mm-Disable-unevictable-compaction.patch
+patchset-zen/sauce/0019-ZEN-INTERACTIVE-mm-Disable-watermark-boosting-by-def.patch
+patchset-zen/sauce/0020-ZEN-INTERACTIVE-mm-Lower-the-non-hugetlbpage-pageblo.patch
+patchset-zen/sauce/0021-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch
+patchset-zen/sauce/0022-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch
+patchset-zen/sauce/0023-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
 
-patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch
-patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch
+patchset-pf/fixes/0001-tpm-do-not-start-chip-while-suspended.patch
+patchset-pf/fixes/0002-x86-insn_decoder_test-allow-longer-symbol-names.patch
 
-patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch
+patchset-zen/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch
+patchset-zen/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch
+patchset-zen/fixes/0003-EDAC-igen6-Fix-the-flood-of-invalid-error-reports.patch