From 3fb1083e44f2dc696e388764349d0d92b4b70fd9 Mon Sep 17 00:00:00 2001 From: Konstantin Demin Date: Thu, 2 Jan 2025 21:18:06 +0300 Subject: [PATCH] release 6.12.8 --- debian/bin/genpatch-pfkernel | 2 +- debian/changelog | 7 + debian/config/amd64/config.cloud | 7 - debian/config/amd64/config.mobile | 4 - debian/config/amd64/config.vm | 7 - debian/config/config | 11 +- ...egacy-client-tracking-initialization.patch | 33 ++ ...nprivileged-CLONE_NEWUSER-by-default.patch | 4 +- ..._GATHER_RCU_TABLE_FREE-unconditional.patch | 60 +++ ...ove-pv_ops.mmu.tlb_remove_table-call.patch | 137 +++++ ...m-add-X86_FEATURE_INVLPGB-definition.patch | 23 + ...-mm-get-INVLPGB-count-max-from-CPUID.patch | 57 ++ ...0005-x86-mm-add-INVLPGB-support-code.patch | 121 +++++ ...m-use-INVLPGB-for-kernel-TLB-flushes.patch | 61 +++ ...x86-tlb-use-INVLPGB-in-flush_tlb_all.patch | 28 + ...cast-TLB-flushing-for-page-reclaim-T.patch | 36 ++ ...oadcast-TLB-invalidation-for-multi-t.patch | 508 ++++++++++++++++++ ...ted-broadcast-flushing-from-tlbbatch.patch | 126 +++++ ...ble-AMD-translation-cache-extensions.patch | 82 +++ ...lidate-final-translations-with-INVLP.patch | 28 + ...e-unnecessary-calls-to-lru_add_drain.patch | 92 ++++ ...d-missing-statement-in-resume_phase3.patch | 27 + debian/patches/series | 16 + 23 files changed, 1455 insertions(+), 22 deletions(-) create mode 100644 debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch create mode 
100644 debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch create mode 100644 debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch diff --git a/debian/bin/genpatch-pfkernel b/debian/bin/genpatch-pfkernel index ee762b8..2cb330e 100755 --- a/debian/bin/genpatch-pfkernel +++ b/debian/bin/genpatch-pfkernel @@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w" dst='debian/patches/pf-tmp' src='../linux-extras' -branches='amd-pstate amd-rapl cpuidle crypto fixes kbuild pksm xfs zstd' +branches='amd-pstate amd-rapl cpuidle crypto fixes invlpgb kbuild pksm xfs zstd' if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi mkdir -p "${dst}" diff --git a/debian/changelog b/debian/changelog index d5bc7ca..935142d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +linux (6.12.8-1) sid; urgency=medium + + * New upstream stable update: + https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.8 + + -- Konstantin Demin Thu, 02 Jan 2025 19:34:34 +0300 + linux (6.12.7-1) sid; urgency=medium * New upstream stable update: diff --git a/debian/config/amd64/config.cloud 
b/debian/config/amd64/config.cloud index fb8e5ae..78b6db3 100644 --- a/debian/config/amd64/config.cloud +++ b/debian/config/amd64/config.cloud @@ -1384,9 +1384,6 @@ CONFIG_BLK_DEV_PMEM=m # CONFIG_NVME_HWMON is not set # CONFIG_NVME_RDMA is not set CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -# CONFIG_NVME_TCP_TLS is not set -# CONFIG_NVME_HOST_AUTH is not set ## ## file: drivers/nvme/target/Kconfig @@ -1394,8 +1391,6 @@ CONFIG_NVME_TCP=m CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m # CONFIG_NVME_TARGET_FCLOOP is not set -CONFIG_NVME_TARGET_TCP=m -# CONFIG_NVME_TARGET_TCP_TLS is not set ## ## file: drivers/of/Kconfig @@ -2495,8 +2490,6 @@ CONFIG_KEXEC_CORE=y CONFIG_LZ4HC_COMPRESS=m CONFIG_LZ4_COMPRESS=m CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m CONFIG_NETFS_SUPPORT=m diff --git a/debian/config/amd64/config.mobile b/debian/config/amd64/config.mobile index 21cb54a..edfbf5d 100644 --- a/debian/config/amd64/config.mobile +++ b/debian/config/amd64/config.mobile @@ -4892,14 +4892,11 @@ CONFIG_OF_PMEM=y ## CONFIG_NVME_HWMON=y # CONFIG_NVME_FC is not set -# CONFIG_NVME_TCP is not set -CONFIG_NVME_HOST_AUTH=y ## ## file: drivers/nvme/target/Kconfig ## # CONFIG_NVME_TARGET_FC is not set -# CONFIG_NVME_TARGET_TCP is not set ## ## file: drivers/nvmem/Kconfig @@ -8756,7 +8753,6 @@ CONFIG_NFC_ST_NCI=m CONFIG_NF_NAT_AMANDA=m CONFIG_NLS_UCS2_UTILS=m CONFIG_NVMEM_LAYOUTS=y -CONFIG_NVME_AUTH=m CONFIG_OF_ADDRESS=y CONFIG_OF_EARLY_FLATTREE=y CONFIG_OF_FLATTREE=y diff --git a/debian/config/amd64/config.vm b/debian/config/amd64/config.vm index 6274efa..26e2257 100644 --- a/debian/config/amd64/config.vm +++ b/debian/config/amd64/config.vm @@ -2355,9 +2355,6 @@ CONFIG_BLK_DEV_PMEM=m CONFIG_NVME_HWMON=y CONFIG_NVME_RDMA=m CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -# CONFIG_NVME_TCP_TLS is not set -# CONFIG_NVME_HOST_AUTH is not set ## ## file: drivers/nvme/target/Kconfig @@ -2365,8 +2362,6 @@ CONFIG_NVME_TCP=m 
CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m # CONFIG_NVME_TARGET_FCLOOP is not set -CONFIG_NVME_TARGET_TCP=m -# CONFIG_NVME_TARGET_TCP_TLS is not set ## ## file: drivers/of/Kconfig @@ -4068,8 +4063,6 @@ CONFIG_LZ4_COMPRESS=m CONFIG_MAPPING_DIRTY_HELPERS=y CONFIG_MCTP_FLOWS=y CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MOUSE_PS2_SMBUS=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m diff --git a/debian/config/config b/debian/config/config index 5a27038..57062d2 100644 --- a/debian/config/config +++ b/debian/config/config @@ -1098,6 +1098,9 @@ CONFIG_NVDIMM_DAX=y CONFIG_BLK_DEV_NVME=m CONFIG_NVME_MULTIPATH=y CONFIG_NVME_VERBOSE_ERRORS=y +CONFIG_NVME_TCP=m +CONFIG_NVME_TCP_TLS=y +CONFIG_NVME_HOST_AUTH=y ## ## file: drivers/nvme/target/Kconfig @@ -1106,7 +1109,9 @@ CONFIG_NVME_TARGET=m # CONFIG_NVME_TARGET_DEBUGFS is not set CONFIG_NVME_TARGET_PASSTHRU=y CONFIG_NVME_TARGET_LOOP=m -# CONFIG_NVME_TARGET_AUTH is not set +CONFIG_NVME_TARGET_TCP=m +CONFIG_NVME_TARGET_TCP_TLS=y +CONFIG_NVME_TARGET_AUTH=y ## ## file: drivers/nvmem/Kconfig @@ -3941,6 +3946,8 @@ CONFIG_MLX4_CORE=m CONFIG_MMCONF_FAM10H=y CONFIG_MMU=y CONFIG_MMU_GATHER_MERGE_VMAS=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MMU_LAZY_TLB_REFCOUNT=y CONFIG_MMU_NOTIFIER=y CONFIG_MODULES_TREE_LOOKUP=y @@ -4015,8 +4022,10 @@ CONFIG_NR_CPUS_RANGE_END=512 CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_NUMA_MEMBLKS=y CONFIG_NVDIMM_KEYS=y +CONFIG_NVME_AUTH=m CONFIG_NVME_CORE=m CONFIG_NVME_FABRICS=m +CONFIG_NVME_KEYRING=m CONFIG_OBJTOOL=y CONFIG_OID_REGISTRY=y CONFIG_OLD_SIGSUSPEND3=y diff --git a/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch b/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch new file mode 100644 index 0000000..0d890b5 --- /dev/null +++ b/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch @@ -0,0 +1,33 @@ +From: Scott Mayhew +Date: Tue, 10 Dec 2024 07:25:54 
-0500 +Subject: nfsd: fix legacy client tracking initialization +Origin: https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/commit/?h=nfsd-next&id=45cd8c0c13fe5c9f1b926bd307df431f8f1b8a16 +Bug: https://bugzilla.kernel.org/show_bug.cgi?id=219580 +Bug-Debian: https://bugs.debian.org/1087900 + +Get rid of the nfsd4_legacy_tracking_ops->init() call in +check_for_legacy_methods(). That will be handled in the caller +(nfsd4_client_tracking_init()). Otherwise, we'll wind up calling +nfsd4_legacy_tracking_ops->init() twice, and the second time we'll +trigger the BUG_ON() in nfsd4_init_recdir(). + +Fixes: 74fd48739d04 ("nfsd: new Kconfig option for legacy client tracking") +Reported-by: Jur van der Burg +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219580 +Signed-off-by: Scott Mayhew +Reviewed-by: Jeff Layton +Signed-off-by: Chuck Lever +--- + fs/nfsd/nfs4recover.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/nfsd/nfs4recover.c ++++ b/fs/nfsd/nfs4recover.c +@@ -2052,7 +2052,6 @@ static inline int check_for_legacy_metho + path_put(&path); + if (status) + return -ENOTDIR; +- status = nn->client_tracking_ops->init(net); + } + return status; + } diff --git a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch index 3b75384..4ba7b9f 100644 --- a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch +++ b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch @@ -34,7 +34,7 @@ Signed-off-by: Serge Hallyn /* * Minimum number of threads to boot the kernel */ -@@ -2158,6 +2164,10 @@ __latent_entropy struct task_struct *cop +@@ -2157,6 +2163,10 @@ __latent_entropy struct task_struct *cop if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -45,7 +45,7 @@ Signed-off-by: Serge Hallyn /* * Thread groups must share signals as 
well, and detached threads * can only be started up within the thread group. -@@ -3311,6 +3321,12 @@ int ksys_unshare(unsigned long unshare_f +@@ -3310,6 +3320,12 @@ int ksys_unshare(unsigned long unshare_f if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; diff --git a/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch b/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch new file mode 100644 index 0000000..1fba26d --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch @@ -0,0 +1,60 @@ +From 60fbdd9e9dc7074d4cd30ada3ba9547d5c007702 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:02 -0500 +Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional + +Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using +paravirt, and not when running on bare metal. + +There is no real good reason to do things differently for +each setup. Make them all the same. + +After this change, the synchronization between get_user_pages_fast +and page table freeing is handled by RCU, which prevents page tables +from being reused for other data while get_user_pages_fast is walking +them. + +This allows us to invalidate page tables while other CPUs have +interrupts disabled. 
+ +Signed-off-by: Rik van Riel +Suggested-by: Peter Zijlstra +--- + arch/x86/Kconfig | 2 +- + arch/x86/kernel/paravirt.c | 7 +------ + 2 files changed, 2 insertions(+), 7 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -270,7 +270,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP +- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT ++ select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) + static_branch_enable(&virt_spin_lock_key); + } + +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +- + struct static_key paravirt_steal_enabled; + struct static_key paravirt_steal_rq_enabled; + +@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = native_tlb_remove_table, ++ .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch b/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch new file mode 100644 index 0000000..4178de2 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch @@ -0,0 +1,137 @@ +From 8966aff4928c0bc3aa79b8729d74da5ea782f73a Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:03 -0500 +Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call + +Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. 
+ +Get rid of the indirection by simply calling tlb_remove_table directly, +and not going through the paravirt function pointers. + +Signed-off-by: Rik van Riel +Suggested-by: Qi Zheng +--- + arch/x86/hyperv/mmu.c | 1 - + arch/x86/include/asm/paravirt.h | 5 ----- + arch/x86/include/asm/paravirt_types.h | 2 -- + arch/x86/kernel/kvm.c | 1 - + arch/x86/kernel/paravirt.c | 1 - + arch/x86/mm/pgtable.c | 16 ++++------------ + arch/x86/xen/mmu_pv.c | 1 - + 7 files changed, 4 insertions(+), 23 deletions(-) + +--- a/arch/x86/hyperv/mmu.c ++++ b/arch/x86/hyperv/mmu.c +@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + } +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); + } + +-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +-} +- + static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) + { + PVOP_VCALL1(mmu.exit_mmap, mm); +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -136,8 +136,6 @@ struct pv_mmu_ops { + void (*flush_tlb_multi)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); +- + /* Hook for intercepting the destruction of an mm_struct. 
*/ + void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) + #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); + #define PGTABLE_HIGHMEM 0 + #endif + +-#ifndef CONFIG_PARAVIRT +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +-#endif +- + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + + pgtable_t pte_alloc_one(struct mm_struct *mm) +@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather * + { + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); +- paravirt_tlb_remove_table(tlb, pte); ++ tlb_remove_table(tlb, pte); + } + + #if CONFIG_PGTABLE_LEVELS > 2 +@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather * + tlb->need_flush_all = 1; + #endif + pagetable_pmd_dtor(ptdesc); +- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); ++ tlb_remove_table(tlb, ptdesc_page(ptdesc)); + } + + #if CONFIG_PGTABLE_LEVELS > 3 +@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather * + + pagetable_pud_dtor(ptdesc); + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); +- 
paravirt_tlb_remove_table(tlb, virt_to_page(pud)); ++ tlb_remove_table(tlb, virt_to_page(pud)); + } + + #if CONFIG_PGTABLE_LEVELS > 4 + void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) + { + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); ++ tlb_remove_table(tlb, virt_to_page(p4d)); + } + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, +- .tlb_remove_table = tlb_remove_table, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, diff --git a/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch b/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch new file mode 100644 index 0000000..cb5131a --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch @@ -0,0 +1,23 @@ +From efde57842082e36ab2e2be5a11c7b06ff9e18b3d Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:04 -0500 +Subject: x86/mm: add X86_FEATURE_INVLPGB definition. + +Add the INVPLGB CPUID definition, allowing the kernel to recognize +whether the CPU supports the INVLPGB instruction. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/cpufeatures.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -335,6 +335,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* "invlpgb" INVLPGB instruction */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ diff --git a/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch b/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch new file mode 100644 index 0000000..fc2995c --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch @@ -0,0 +1,57 @@ +From 98953e10e342ceea1dc877cfb63318fa85879a59 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:05 -0500 +Subject: x86/mm: get INVLPGB count max from CPUID + +The CPU advertises the maximum number of pages that can be shot down +with one INVLPGB instruction in the CPUID data. + +Save that information for later use. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/tlbflush.h | 1 + + arch/x86/kernel/cpu/amd.c | 8 ++++++++ + arch/x86/kernel/setup.c | 4 ++++ + 3 files changed, 13 insertions(+) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -182,6 +182,7 @@ static inline void cr4_init_shadow(void) + + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; ++extern u16 invlpgb_count_max; + + extern void initialize_tlbstate_and_flush(void); + +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1135,6 +1135,14 @@ static void cpu_detect_tlb_amd(struct cp + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ if (c->extended_cpuid_level < 0x80000008) ++ return; ++ ++ cpuid(0x80000008, &eax, &ebx, &ecx, &edx); ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ invlpgb_count_max = (edx & 0xffff) + 1; + } + + static const struct cpu_dev amd_cpu_dev = { +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -138,6 +138,10 @@ __visible unsigned long mmu_cr4_features + __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; + #endif + ++#ifdef CONFIG_CPU_SUP_AMD ++u16 invlpgb_count_max __ro_after_init; ++#endif ++ + #ifdef CONFIG_IMA + static phys_addr_t ima_kexec_buffer_phys; + static size_t ima_kexec_buffer_size; diff --git a/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch b/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch new file mode 100644 index 0000000..f116ad6 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch @@ -0,0 +1,121 @@ +From bc9d1fa1bd32dca78f38bd2a8557e7fc638308bd Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:06 -0500 +Subject: x86/mm: add INVLPGB support code + +Add invlpgb.h with the helper functions and definitions needed to use +broadcast TLB invalidation on AMD EPYC 
3 and newer CPUs. + +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 93 +++++++++++++++++++++++++++++++++ + arch/x86/include/asm/tlbflush.h | 1 + + 2 files changed, 94 insertions(+) + create mode 100644 arch/x86/include/asm/invlpgb.h + +--- /dev/null ++++ b/arch/x86/include/asm/invlpgb.h +@@ -0,0 +1,93 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_INVLPGB ++#define _ASM_X86_INVLPGB ++ ++#include ++ ++/* ++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. ++ * ++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can ++ * be done in a parallel fashion. ++ * ++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from ++ * this CPU have completed. ++ */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, ++ int extra_count, bool pmd_stride, unsigned long flags) ++{ ++ u64 rax = addr | flags; ++ u32 ecx = (pmd_stride << 31) | extra_count; ++ u32 edx = (pcid << 16) | asid; ++ ++ asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx)); ++} ++ ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first can be used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_VA BIT(0) ++#define INVLPGB_PCID BIT(1) ++#define INVLPGB_ASID BIT(2) ++#define INVLPGB_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FINAL_ONLY BIT(4) ++#define INVLPGB_INCLUDE_NESTED BIT(5) ++ ++/* Flush all mappings for a given pcid and addr, not including globals. 
*/ ++static inline void invlpgb_flush_user(unsigned long pcid, ++ unsigned long addr) ++{ ++ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); ++} ++ ++static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr, ++ int nr, bool pmd_stride) ++{ ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++} ++ ++/* Flush all mappings for a given ASID, not including globals. */ ++static inline void invlpgb_flush_single_asid(unsigned long asid) ++{ ++ __invlpgb(asid, 0, 0, 0, 0, INVLPGB_ASID); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid(unsigned long pcid) ++{ ++ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr(unsigned long addr, int nr) ++{ ++ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, 0); ++} ++ ++/* Wait for INVLPGB originated by this CPU to complete. 
*/ ++static inline void tlbsync(void) ++{ ++ asm volatile("tlbsync"); ++} ++ ++#endif /* _ASM_X86_INVLPGB */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include diff --git a/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch b/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch new file mode 100644 index 0000000..56a6df3 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch @@ -0,0 +1,61 @@ +From ffd834c7140dc5fcaf96161c6d8c4601bb700afe Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:07 -0500 +Subject: x86/mm: use INVLPGB for kernel TLB flushes + +Use broadcast TLB invalidation for kernel addresses when available. + +This stops us from having to send IPIs for kernel TLB flushes. + +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 31 +++++++++++++++++++++++++++++++ + 1 file changed, 31 insertions(+) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1048,6 +1048,32 @@ void flush_tlb_all(void) + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + ++static void broadcast_kernel_range_flush(unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ unsigned long maxnr = invlpgb_count_max; ++ unsigned long threshold = tlb_single_page_flush_ceiling * maxnr; ++ ++ /* ++ * TLBSYNC only waits for flushes originating on the same CPU. ++ * Disabling migration allows us to wait on all flushes. 
++ */ ++ guard(preempt)(); ++ ++ if (end == TLB_FLUSH_ALL || ++ (end - start) > threshold << PAGE_SHIFT) { ++ invlpgb_flush_all(); ++ } else { ++ unsigned long nr; ++ for (addr = start; addr < end; addr += nr << PAGE_SHIFT) { ++ nr = min((end - addr) >> PAGE_SHIFT, maxnr); ++ invlpgb_flush_addr(addr, nr); ++ } ++ } ++ ++ tlbsync(); ++} ++ + static void do_kernel_range_flush(void *info) + { + struct flush_tlb_info *f = info; +@@ -1060,6 +1086,11 @@ static void do_kernel_range_flush(void * + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) + { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ broadcast_kernel_range_flush(start, end); ++ return; ++ } ++ + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { diff --git a/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch b/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch new file mode 100644 index 0000000..fe9ed69 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch @@ -0,0 +1,28 @@ +From 13fac8226036456c15c517c1dd77be5109a61da2 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:08 -0500 +Subject: x86/tlb: use INVLPGB in flush_tlb_all + +The flush_tlb_all() function is not used a whole lot, but we might +as well use broadcast TLB flushing there, too. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1045,6 +1045,12 @@ static void do_flush_tlb_all(void *info) + void flush_tlb_all(void) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ guard(preempt)(); ++ invlpgb_flush_all(); ++ tlbsync(); ++ return; ++ } + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + diff --git a/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch b/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch new file mode 100644 index 0000000..4f33505 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch @@ -0,0 +1,36 @@ +From 765d531296765e7fb2888c70cb56c0e25b459231 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:09 -0500 +Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing + +In the page reclaim code, we only track the CPU(s) where the TLB needs +to be flushed, rather than all the individual mappings that may be getting +invalidated. + +Use broadcast TLB flushing when that is available. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1281,8 +1281,16 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all); + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + { + struct flush_tlb_info *info; ++ int cpu; + +- int cpu = get_cpu(); ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ guard(preempt)(); ++ invlpgb_flush_all_nonglobals(); ++ tlbsync(); ++ return; ++ } ++ ++ cpu = get_cpu(); + + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); diff --git a/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch b/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch new file mode 100644 index 0000000..7786212 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch @@ -0,0 +1,508 @@ +From 8b23125a3200a330fb407133f33aeb9ad3232603 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:10 -0500 +Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded + processes + +Use broadcast TLB invalidation, using the INVPLGB instruction, on AMD EPYC 3 +and newer CPUs. + +In order to not exhaust PCID space, and keep TLB flushes local for single +threaded processes, we only hand out broadcast ASIDs to processes active on +3 or more CPUs, and gradually increase the threshold as broadcast ASID space +is depleted. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/mmu.h | 6 + + arch/x86/include/asm/mmu_context.h | 12 ++ + arch/x86/include/asm/tlbflush.h | 17 ++ + arch/x86/mm/tlb.c | 310 ++++++++++++++++++++++++++++- + 4 files changed, 336 insertions(+), 9 deletions(-) + +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -46,6 +46,12 @@ typedef struct { + unsigned long flags; + #endif + ++#ifdef CONFIG_CPU_SUP_AMD ++ struct list_head broadcast_asid_list; ++ u16 broadcast_asid; ++ bool asid_transition; ++#endif ++ + #ifdef CONFIG_ADDRESS_MASKING + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ + unsigned long lam_cr3_mask; +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++extern void destroy_context_free_broadcast_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). 
+@@ -160,6 +162,13 @@ static inline int init_new_context(struc + mm->context.execute_only_pkey = -1; + } + #endif ++ ++#ifdef CONFIG_CPU_SUP_AMD ++ INIT_LIST_HEAD(&mm->context.broadcast_asid_list); ++ mm->context.broadcast_asid = 0; ++ mm->context.asid_transition = false; ++#endif ++ + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -169,6 +178,9 @@ static inline int init_new_context(struc + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++#ifdef CONFIG_CPU_SUP_AMD ++ destroy_context_free_broadcast_asid(mm); ++#endif + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -65,6 +65,23 @@ static inline void cr4_clear_bits(unsign + */ + #define TLB_NR_DYN_ASIDS 6 + ++#ifdef CONFIG_CPU_SUP_AMD ++#define is_dyn_asid(asid) ((asid) < TLB_NR_DYN_ASIDS) ++#define is_broadcast_asid(asid) ((asid) >= TLB_NR_DYN_ASIDS) ++#define in_asid_transition(info) ((info)->mm && (info)->mm->context.asid_transition) ++#define mm_broadcast_asid(mm) ((mm)->context.broadcast_asid) ++#else ++#define is_dyn_asid(asid) true ++#define is_broadcast_asid(asid) false ++#define in_asid_transition(info) false ++#define mm_broadcast_asid(mm) 0 ++ ++static inline bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ return false; ++} ++#endif ++ + struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, 
because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -225,6 +227,18 @@ static void choose_new_asid(struct mm_st + return; + } + ++ /* ++ * TLB consistency for this ASID is maintained with INVLPGB; ++ * TLB flushes happen even while the process isn't running. ++ */ ++#ifdef CONFIG_CPU_SUP_AMD ++ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(next)) { ++ *new_asid = mm_broadcast_asid(next); ++ *need_flush = false; ++ return; ++ } ++#endif ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -251,6 +265,245 @@ static void choose_new_asid(struct mm_st + *need_flush = true; + } + ++#ifdef CONFIG_CPU_SUP_AMD ++/* ++ * Logic for AMD INVLPGB support. ++ */ ++static DEFINE_RAW_SPINLOCK(broadcast_asid_lock); ++static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS; ++static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 }; ++static LIST_HEAD(broadcast_asid_list); ++static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++static void reset_broadcast_asid_space(void) ++{ ++ mm_context_t *context; ++ ++ lockdep_assert_held(&broadcast_asid_lock); ++ ++ /* ++ * Flush once when we wrap around the ASID space, so we won't need ++ * to flush every time we allocate an ASID for broadcast flushing. ++ */ ++ invlpgb_flush_all_nonglobals(); ++ tlbsync(); ++ ++ /* ++ * Leave the currently used broadcast ASIDs set in the bitmap, since ++ * those cannot be reused before the next wraparound and flush. 
++ */ ++ bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE); ++ list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list) ++ __set_bit(context->broadcast_asid, broadcast_asid_used); ++ ++ last_broadcast_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 get_broadcast_asid(void) ++{ ++ lockdep_assert_held(&broadcast_asid_lock); ++ ++ do { ++ u16 start = last_broadcast_asid; ++ u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start); ++ ++ if (asid >= MAX_ASID_AVAILABLE) { ++ reset_broadcast_asid_space(); ++ continue; ++ } ++ ++ /* Try claiming this broadcast ASID. */ ++ if (!test_and_set_bit(asid, broadcast_asid_used)) { ++ last_broadcast_asid = asid; ++ return asid; ++ } ++ } while (1); ++} ++ ++/* ++ * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast ++ * (INVLPGB) ASID, or the other way around. ++ */ ++static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ u16 broadcast_asid = mm_broadcast_asid(next); ++ ++ if (broadcast_asid && prev_asid != broadcast_asid) ++ return true; ++ ++ if (!broadcast_asid && is_broadcast_asid(prev_asid)) ++ return true; ++ ++ return false; ++} ++ ++void destroy_context_free_broadcast_asid(struct mm_struct *mm) ++{ ++ if (!mm->context.broadcast_asid) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&broadcast_asid_lock); ++ mm->context.broadcast_asid = 0; ++ list_del(&mm->context.broadcast_asid_list); ++ broadcast_asid_available++; ++} ++ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. 
*/ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a broadcast ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_broadcast_asid(struct mm_struct *mm) ++{ ++ guard(raw_spinlock_irqsave)(&broadcast_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm->context.broadcast_asid) ++ return; ++ ++ mm->context.broadcast_asid = get_broadcast_asid(); ++ mm->context.asid_transition = true; ++ list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list); ++ broadcast_asid_available--; ++} ++ ++/* ++ * Figure out whether to assign a broadcast (global) ASID to a process. ++ * We vary the threshold by how empty or full broadcast ASID space is. ++ * 1/4 full: >= 4 active threads ++ * 1/2 full: >= 8 active threads ++ * 3/4 full: >= 16 active threads ++ * 7/8 full: >= 32 active threads ++ * etc ++ * ++ * This way we should never exhaust the broadcast ASID space, even on very ++ * large systems, and the processes with the largest number of active ++ * threads should be able to use broadcast TLB invalidation. 
++ */ ++#define HALFFULL_THRESHOLD 8 ++static bool meets_broadcast_asid_threshold(struct mm_struct *mm) ++{ ++ int avail = broadcast_asid_available; ++ int threshold = HALFFULL_THRESHOLD; ++ ++ if (!avail) ++ return false; ++ ++ if (avail > MAX_ASID_AVAILABLE * 3 / 4) { ++ threshold = HALFFULL_THRESHOLD / 4; ++ } else if (avail > MAX_ASID_AVAILABLE / 2) { ++ threshold = HALFFULL_THRESHOLD / 2; ++ } else if (avail < MAX_ASID_AVAILABLE / 3) { ++ do { ++ avail *= 2; ++ threshold *= 2; ++ } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2); ++ } ++ ++ return mm_active_cpus_exceeds(mm, threshold); ++} ++ ++static void count_tlb_flush(struct mm_struct *mm) ++{ ++ if (!static_cpu_has(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ if (meets_broadcast_asid_threshold(mm)) ++ use_broadcast_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_broadcast_asid(mm); ++ int cpu; ++ ++ if (!mm->context.asid_transition) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the broadcast ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ */ ++ if (per_cpu(cpu_tlbstate.loaded_mm_asid, cpu) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the broadcast ASID. */ ++ mm->context.asid_transition = 0; ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long maxnr = invlpgb_count_max; ++ unsigned long asid = info->mm->context.broadcast_asid; ++ unsigned long addr = info->start; ++ unsigned long nr; ++ ++ /* Flushing multiple pages at once is not supported with 1GB pages. 
*/ ++ if (info->stride_shift > PMD_SHIFT) ++ maxnr = 1; ++ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid(user_pcid(asid)); ++ } else do { ++ /* ++ * Calculate how many pages can be flushed at once; if the ++ * remainder of the range is less than one page, flush one. ++ */ ++ nr = min(maxnr, (info->end - addr) >> info->stride_shift); ++ nr = max(nr, 1); ++ ++ invlpgb_flush_user_nr(kern_pcid(asid), addr, nr, pmd); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr(user_pcid(asid), addr, nr, pmd); ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ tlbsync(); ++} ++#endif /* CONFIG_CPU_SUP_AMD */ ++ + /* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. +@@ -556,8 +809,9 @@ void switch_mm_irqs_off(struct mm_struct + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != +- next->context.ctx_id); ++ if (is_dyn_asid(prev_asid)) ++ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ next->context.ctx_id); + + /* + * If this races with another thread that enables lam, 'new_lam' +@@ -574,6 +828,23 @@ void switch_mm_irqs_off(struct mm_struct + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* ++ * Check if the current mm is transitioning to a new ASID. ++ */ ++ if (needs_broadcast_asid_reload(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this PCID up to date ++ * all the time. 
++ */ ++ if (is_broadcast_asid(prev_asid)) ++ return; ++ ++ /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. +@@ -629,8 +900,10 @@ void switch_mm_irqs_off(struct mm_struct + barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_BUG_ON(is_broadcast_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -749,7 +1022,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -769,6 +1042,16 @@ static void flush_tlb_func(void *info) + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a broadcast ASID */ ++ if (needs_broadcast_asid_reload(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. 
*/ ++ if (is_broadcast_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -786,6 +1069,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -926,7 +1211,7 @@ STATIC_NOPV void native_flush_tlb_multi( + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || in_asid_transition(info)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, +@@ -998,14 +1283,18 @@ void flush_tlb_mm_range(struct mm_struct + bool freed_tables) + { + struct flush_tlb_info *info; ++ unsigned long threshold = tlb_single_page_flush_ceiling; + u64 new_tlb_gen; + int cpu; + ++ if (static_cpu_has(X86_FEATURE_INVLPGB)) ++ threshold *= invlpgb_count_max; ++ + cpu = get_cpu(); + + /* Should we flush just the requested range? */ + if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { ++ ((end - start) >> stride_shift) > threshold) { + start = 0; + end = TLB_FLUSH_ALL; + } +@@ -1021,8 +1310,11 @@ void flush_tlb_mm_range(struct mm_struct + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (IS_ENABLED(CONFIG_CPU_SUP_AMD) && mm_broadcast_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + flush_tlb_multi(mm_cpumask(mm), info); ++ count_tlb_flush(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); diff --git a/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch b/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch new file mode 100644 index 0000000..42e7f79 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch @@ -0,0 +1,126 @@ +From 1767a2786ebbe3451f973df44485309c2a8fd8a5 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:11 -0500 +Subject: x86,tlb: do targeted broadcast flushing from tlbbatch code + +Instead of doing a system-wide TLB flush from arch_tlbbatch_flush, +queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending. + +This also allows us to avoid adding the CPUs of processes using broadcast +flushing to the batch->cpumask, and will hopefully further reduce TLB +flushing from the reclaim and compaction paths. + +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/tlbbatch.h | 1 + + arch/x86/include/asm/tlbflush.h | 12 +++------ + arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++------- + 3 files changed, 42 insertions(+), 19 deletions(-) + +--- a/arch/x86/include/asm/tlbbatch.h ++++ b/arch/x86/include/asm/tlbbatch.h +@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch { + * the PFNs being flushed.. 
+ */ + struct cpumask cpumask; ++ bool used_invlpgb; + }; + + #endif /* _ARCH_X86_TLBBATCH_H */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -296,21 +296,15 @@ static inline u64 inc_mm_tlb_gen(struct + return atomic64_inc_return(&mm->context.tlb_gen); + } + +-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, +- struct mm_struct *mm, +- unsigned long uaddr) +-{ +- inc_mm_tlb_gen(mm); +- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); +- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +-} +- + static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) + { + flush_tlb_mm(mm); + } + + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); ++extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr); + + static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1573,16 +1573,7 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all); + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + { + struct flush_tlb_info *info; +- int cpu; +- +- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { +- guard(preempt)(); +- invlpgb_flush_all_nonglobals(); +- tlbsync(); +- return; +- } +- +- cpu = get_cpu(); ++ int cpu = get_cpu(); + + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); +@@ -1600,12 +1591,49 @@ void arch_tlbbatch_flush(struct arch_tlb + local_irq_enable(); + } + ++ /* ++ * If we issued (asynchronous) INVLPGB flushes, wait for them here. ++ * The cpumask above contains only CPUs that were running tasks ++ * not using broadcast TLB flushing. 
++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) { ++ tlbsync(); ++ migrate_enable(); ++ batch->used_invlpgb = false; ++ } ++ + cpumask_clear(&batch->cpumask); + + put_flush_tlb_info(); + put_cpu(); + } + ++void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr) ++{ ++ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(mm)) { ++ u16 asid = mm_broadcast_asid(mm); ++ /* ++ * Queue up an asynchronous invalidation. The corresponding ++ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done ++ * on the same CPU. ++ */ ++ if (!batch->used_invlpgb) { ++ batch->used_invlpgb = true; ++ migrate_disable(); ++ } ++ invlpgb_flush_user_nr(kern_pcid(asid), uaddr, 1, 0); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr(user_pcid(asid), uaddr, 1, 0); ++ } else { ++ inc_mm_tlb_gen(mm); ++ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); ++ } ++ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); ++} ++ + /* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or diff --git a/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch b/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch new file mode 100644 index 0000000..b72f506 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch @@ -0,0 +1,82 @@ +From 13faf551d1a146ed18c448babe1953def4ed3d56 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:12 -0500 +Subject: x86/mm: enable AMD translation cache extensions + +With AMD TCE (translation cache extensions) only the intermediate mappings +that cover the address range zapped by INVLPG / INVLPGB get invalidated, +rather than all intermediate mappings getting 
zapped at every TLB invalidation. + +This can help reduce the TLB miss rate, by keeping more intermediate +mappings in the cache. + +>From the AMD manual: + +Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit +to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on +TLB entries. When this bit is 0, these instructions remove the target PTE +from the TLB as well as all upper-level table entries that are cached +in the TLB, whether or not they are associated with the target PTE. +When this bit is set, these instructions will remove the target PTE and +only those upper-level entries that lead to the target PTE in +the page table hierarchy, leaving unrelated upper-level entries intact. + +Signed-off-by: Rik van Riel +--- + arch/x86/kernel/cpu/amd.c | 8 ++++++++ + arch/x86/mm/tlb.c | 10 +++++++--- + 2 files changed, 15 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1143,6 +1143,14 @@ static void cpu_detect_tlb_amd(struct cp + + /* Max number of pages INVLPGB can invalidate in one shot */ + invlpgb_count_max = (edx & 0xffff) + 1; ++ ++ /* If supported, enable translation cache extensions (TCE) */ ++ cpuid(0x80000001, &eax, &ebx, &ecx, &edx); ++ if (ecx & BIT(17)) { ++ u64 msr = native_read_msr(MSR_EFER); ++ msr |= BIT(15); ++ wrmsrl(MSR_EFER, msr); ++ } + } + + static const struct cpu_dev amd_cpu_dev = { +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -477,7 +477,7 @@ static void broadcast_tlb_flush(struct f + if (info->stride_shift > PMD_SHIFT) + maxnr = 1; + +- if (info->end == TLB_FLUSH_ALL) { ++ if (info->end == TLB_FLUSH_ALL || info->freed_tables) { + invlpgb_flush_single_pcid(kern_pcid(asid)); + /* Do any CPUs supporting INVLPGB need PTI? */ + if (static_cpu_has(X86_FEATURE_PTI)) +@@ -1110,7 +1110,7 @@ static void flush_tlb_func(void *info) + * + * The only question is whether to do a full or partial flush. 
+ * +- * We do a partial flush if requested and two extra conditions ++ * We do a partial flush if requested and three extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that +@@ -1137,10 +1137,14 @@ static void flush_tlb_func(void *info) + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. ++ * ++ * 3. No page tables were freed. If page tables were freed, a full ++ * flush ensures intermediate translations in the TLB get flushed. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && +- f->new_tlb_gen == mm_tlb_gen) { ++ f->new_tlb_gen == mm_tlb_gen && ++ !f->freed_tables) { + /* Partial flush */ + unsigned long addr = f->start; + diff --git a/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch b/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch new file mode 100644 index 0000000..7feb629 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch @@ -0,0 +1,28 @@ +From 2fc0be5fbcee1a62162b699451bb94f90ec64244 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:13 -0500 +Subject: x86/mm: only invalidate final translations with INVLPGB + +Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVPLGB. +This way only leaf mappings get removed from the TLB, leaving intermediate +translations cached. + +On the (rare) occasions where we free page tables we do a full flush, +ensuring intermediate translations get flushed from the TLB. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/invlpgb.h ++++ b/arch/x86/include/asm/invlpgb.h +@@ -51,7 +51,7 @@ static inline void invlpgb_flush_user(un + static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr, + int nr, bool pmd_stride) + { +- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA | INVLPGB_FINAL_ONLY); + } + + /* Flush all mappings for a given ASID, not including globals. */ diff --git a/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch b/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch new file mode 100644 index 0000000..f2714b6 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch @@ -0,0 +1,92 @@ +From a3ff46a157cadb29349c5b388fc70804c351e561 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Thu, 19 Dec 2024 15:32:53 -0500 +Subject: mm: remove unnecessary calls to lru_add_drain + +There seem to be several categories of calls to lru_add_drain +and lru_add_drain_all. + +The first are code paths that recently allocated, swapped in, +or otherwise processed a batch of pages, and want them all on +the LRU. These drain pages that were recently allocated, +probably on the local CPU. + +A second category are code paths that are actively trying to +reclaim, migrate, or offline memory. These often use lru_add_drain_all, +to drain the caches on all CPUs. + +However, there also seem to be some other callers where we +aren't really doing either. They are calling lru_add_drain(), +despite operating on pages that may have been allocated +long ago, and quite possibly on different CPUs. + +Those calls are not likely to be effective at anything but +creating lock contention on the LRU locks. 
+ +Remove the lru_add_drain calls in the latter category. + +Signed-off-by: Rik van Riel +Suggested-by: David Hildenbrand +--- + mm/memory.c | 1 - + mm/mmap.c | 2 -- + mm/swap_state.c | 1 - + mm/vma.c | 2 -- + 4 files changed, 6 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are + struct mmu_notifier_range range; + struct mmu_gather tlb; + +- lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + address, end); + hugetlb_zap_begin(vma, &range.start, &range.end); +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1927,7 +1927,6 @@ void exit_mmap(struct mm_struct *mm) + goto destroy; + } + +- lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? but nobody should be looking */ +@@ -2370,7 +2369,6 @@ int relocate_vma_down(struct vm_area_str + vma, new_start, length, false, true)) + return -ENOMEM; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + +- lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); +--- a/mm/vma.c ++++ b/mm/vma.c +@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas, + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, +@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct + * were isolated before we downgraded mmap_lock. 
+ */ + mas_set(mas_detach, 1); +- lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, diff --git a/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch new file mode 100644 index 0000000..45edbcc --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch @@ -0,0 +1,27 @@ +From 6f554b20207f69146c07be3743b115e42f443627 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Fri, 27 Dec 2024 15:08:09 +0100 +Subject: drm/amdgpu: Add missing statement in resume_phase3 + +Fixes: 73dae652dcac776296890da215ee7dec357a1032 +See: https://gitlab.freedesktop.org/drm/amd/-/issues/3853#note_2714815 +For: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/101 +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 29b8346b..cbca5fa7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3723,6 +3723,7 @@ static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) + r = adev->ip_blocks[i].version->funcs->resume(adev); + if (r) + return r; ++ adev->ip_blocks[i].status.hw = true; + } + } + +-- +2.45.2 + diff --git a/debian/patches/series b/debian/patches/series index bcf56fc..ea49e6e 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -72,6 +72,7 @@ features/x86/x86-make-x32-syscall-support-conditional.patch bugfix/all/disable-some-marvell-phys.patch bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch bugfix/all/documentation-use-relative-source-paths-in-abi-documentation.patch 
+bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch # Miscellaneous features @@ -231,6 +232,20 @@ patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.pat patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch +patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch +patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch +patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch +patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch +patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch +patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch +patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch +patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch +patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch +patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch +patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch +patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch +patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch + patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch @@ -308,3 +323,4 @@ patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patc patchset-pf/fixes/0003-USB-core-Disable-LPM-only-for-non-suspended-ports.patch patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch +patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch