diff --git a/debian/changelog b/debian/changelog index eb86be2..038ae8b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +linux (6.12.11-1) sid; urgency=medium + + * New upstream stable update: + https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.11 + + -- Konstantin Demin Tue, 28 Jan 2025 09:10:17 +0300 + linux (6.12.10-1) sid; urgency=medium * New upstream stable update: diff --git a/debian/config/amd64/config.cloud b/debian/config/amd64/config.cloud index 135a2c1..0a21e5d 100644 --- a/debian/config/amd64/config.cloud +++ b/debian/config/amd64/config.cloud @@ -2491,8 +2491,6 @@ CONFIG_KEXEC_CORE=y CONFIG_LZ4HC_COMPRESS=m CONFIG_LZ4_COMPRESS=m CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m CONFIG_NETFS_SUPPORT=m diff --git a/debian/config/amd64/config.vm b/debian/config/amd64/config.vm index b880cfc..f8baf9b 100644 --- a/debian/config/amd64/config.vm +++ b/debian/config/amd64/config.vm @@ -4064,8 +4064,6 @@ CONFIG_LZ4_COMPRESS=m CONFIG_MAPPING_DIRTY_HELPERS=y CONFIG_MCTP_FLOWS=y CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MOUSE_PS2_SMBUS=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m diff --git a/debian/config/config b/debian/config/config index e169901..d4a6791 100644 --- a/debian/config/config +++ b/debian/config/config @@ -3945,6 +3945,8 @@ CONFIG_MLX4_CORE=m CONFIG_MMCONF_FAM10H=y CONFIG_MMU=y CONFIG_MMU_GATHER_MERGE_VMAS=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MMU_LAZY_TLB_REFCOUNT=y CONFIG_MMU_NOTIFIER=y CONFIG_MODULES_TREE_LOOKUP=y @@ -4186,6 +4188,7 @@ CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m CONFIG_X86=y CONFIG_X86_64=y CONFIG_X86_64_SMP=y +CONFIG_X86_BROADCAST_TLB_FLUSH=y CONFIG_X86_CMOV=y CONFIG_X86_CMPXCHG64=y CONFIG_X86_DEBUGCTLMSR=y diff --git a/debian/patches/debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch b/debian/patches/debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch deleted file mode 100644 index 21fcca4..0000000 --- a/debian/patches/debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Ben Hutchings -Subject: cdc_ncm,cdc_mbim: Use NCM by default -Date: Sun, 31 Mar 2013 03:58:04 +0100 -Forwarded: not-needed - -Devices that support both NCM and MBIM modes should be kept in NCM -mode unless there is userland support for MBIM. - -Set the default value of cdc_ncm.prefer_mbim to false and leave it to -userland (modem-manager) to override this with a modprobe.conf file -once it's ready to speak MBIM. - ---- ---- a/drivers/net/usb/cdc_ncm.c -+++ b/drivers/net/usb/cdc_ncm.c -@@ -54,11 +54,7 @@ - #include - #include - --#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM) --static bool prefer_mbim = true; --#else - static bool prefer_mbim; --#endif - module_param(prefer_mbim, bool, 0644); - MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions"); - diff --git a/debian/patches/debian/export-symbols-needed-by-android-drivers.patch b/debian/patches/debian/export-symbols-needed-by-android-drivers.patch index 255d526..d724d48 100644 --- a/debian/patches/debian/export-symbols-needed-by-android-drivers.patch +++ b/debian/patches/debian/export-symbols-needed-by-android-drivers.patch @@ -22,7 +22,7 @@ Export the currently un-exported symbols it depends on. 
--- a/fs/file.c +++ b/fs/file.c -@@ -792,6 +792,7 @@ struct file *file_close_fd(unsigned int +@@ -793,6 +793,7 @@ struct file *file_close_fd(unsigned int return file; } diff --git a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch index 2cda6f9..fadbf0b 100644 --- a/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch +++ b/debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch @@ -47,7 +47,7 @@ Signed-off-by: Christian Loehle .reflect = menu_reflect, --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c -@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui +@@ -542,7 +542,7 @@ static int teo_enable_device(struct cpui static struct cpuidle_governor teo_governor = { .name = "teo", diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch b/debian/patches/patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch new file mode 100644 index 0000000..c4e69c0 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch @@ -0,0 +1,123 @@ +From 6cb30d7518301094b9c7397a24a22cf538a1d64c Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:20 -0500 +Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional + +Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using +paravirt, and not when running on bare metal. + +There is no real good reason to do things differently for +each setup. Make them all the same. + +Currently get_user_pages_fast synchronizes against page table +freeing in two different ways: +- on bare metal, by blocking IRQs, which block TLB flush IPIs +- on paravirt, with MMU_GATHER_RCU_TABLE_FREE + +This is done because some paravirt TLB flush implementations +handle the TLB flush in the hypervisor, and will do the flush +even when the target CPU has interrupts disabled. + +Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE. +Using RCU synchronization between page table freeing and get_user_pages_fast() +allows bare metal to also do TLB flushing while interrupts are disabled. + +Various places in the mm do still block IRQs or disable preemption +as an implicit way to block RCU frees. + +That makes it safe to use INVLPGB on AMD CPUs. 
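To make the two synchronization schemes above concrete, here is a minimal sketch (not the actual get_user_pages_fast() code) of the lockless-walk pattern both of them protect:

    unsigned long flags;

    /*
     * Bare metal: keeping IRQs off on this CPU holds back the TLB
     * flush IPI, and with it the page table free that follows it.
     * With RCU-deferred freeing, the same IRQs-off (or preemption-off)
     * section also implicitly holds back the RCU free, as noted above.
     */
    local_irq_save(flags);
    /* ... walk the page tables without taking page table locks ... */
    local_irq_restore(flags);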
+ +Signed-off-by: Rik van Riel +Suggested-by: Peter Zijlstra +--- + arch/x86/Kconfig | 2 +- + arch/x86/kernel/paravirt.c | 7 +------ + arch/x86/mm/pgtable.c | 16 ++++------------ + 3 files changed, 6 insertions(+), 19 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -270,7 +270,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP +- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT ++ select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) + static_branch_enable(&virt_spin_lock_key); + } + +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +- + struct static_key paravirt_steal_enabled; + struct static_key paravirt_steal_rq_enabled; + +@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = native_tlb_remove_table, ++ .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); + #define PGTABLE_HIGHMEM 0 + #endif + +-#ifndef CONFIG_PARAVIRT +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +-#endif +- + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + + pgtable_t pte_alloc_one(struct mm_struct *mm) +@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather * + { + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); +- paravirt_tlb_remove_table(tlb, pte); ++ tlb_remove_table(tlb, pte); + } + + #if CONFIG_PGTABLE_LEVELS > 2 +@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather * + tlb->need_flush_all = 1; + #endif + pagetable_pmd_dtor(ptdesc); +- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); ++ tlb_remove_table(tlb, ptdesc_page(ptdesc)); + } + + #if CONFIG_PGTABLE_LEVELS > 3 +@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather * + + pagetable_pud_dtor(ptdesc); + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(pud)); ++ tlb_remove_table(tlb, virt_to_page(pud)); + } + + #if CONFIG_PGTABLE_LEVELS > 4 + void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) + { + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); ++ tlb_remove_table(tlb, virt_to_page(p4d)); + } + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch b/debian/patches/patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch new file mode 100644 index 0000000..4db4939 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch @@ -0,0 +1,84 @@ +From df8f812b62c450b98b972ad0a4d5a5ba400f5eae Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:21 -0500 +Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call + +Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. 
+ +Get rid of the indirection by simply calling tlb_remove_table directly, +and not going through the paravirt function pointers. + +Signed-off-by: Rik van Riel +Suggested-by: Qi Zheng +--- + arch/x86/hyperv/mmu.c | 1 - + arch/x86/include/asm/paravirt.h | 5 ----- + arch/x86/include/asm/paravirt_types.h | 2 -- + arch/x86/kernel/kvm.c | 1 - + arch/x86/kernel/paravirt.c | 1 - + arch/x86/xen/mmu_pv.c | 1 - + 6 files changed, 11 deletions(-) + +--- a/arch/x86/hyperv/mmu.c ++++ b/arch/x86/hyperv/mmu.c +@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + } +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); + } + +-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +-} +- + static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) + { + PVOP_VCALL1(mmu.exit_mmap, mm); +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -136,8 +136,6 @@ struct pv_mmu_ops { + void (*flush_tlb_multi)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); +- + /* Hook for intercepting the destruction of an mm_struct. */ + void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) + #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, +- .tlb_remove_table = tlb_remove_table, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch b/debian/patches/patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch new file mode 100644 index 0000000..1e0ea75 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch @@ -0,0 +1,93 @@ +From 8b2bd3f69b50cfe59eee4506413715878bcbb901 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:22 -0500 +Subject: x86/mm: consolidate full flush threshold decision + +Reduce code duplication by consolidating the decision point +for whether to do individual invalidations or a full flush +inside get_flush_tlb_info. 
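As a worked example of the consolidated check (names as in the hunk below; 33 is the usual default of tlb_single_page_flush_ceiling): a 256 KiB flush at 4 KiB stride covers 64 pages, which exceeds the ceiling, so get_flush_tlb_info() promotes the request to a full flush:

    /* (end - start) >> stride_shift == 0x40000 >> 12 == 64 > 33 */
    if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
            info->start = 0;
            info->end = TLB_FLUSH_ALL;
    }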
+ +Signed-off-by: Rik van Riel +Suggested-by: Dave Hansen +--- + arch/x86/mm/tlb.c | 43 ++++++++++++++++++++----------------------- + 1 file changed, 20 insertions(+), 23 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -981,6 +981,15 @@ static struct flush_tlb_info *get_flush_ + info->new_tlb_gen = new_tlb_gen; + info->initiating_cpu = smp_processor_id(); + ++ /* ++ * If the number of flushes is so large that a full flush ++ * would be faster, do a full flush. ++ */ ++ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { ++ info->start = 0; ++ info->end = TLB_FLUSH_ALL; ++ } ++ + return info; + } + +@@ -998,17 +1007,8 @@ void flush_tlb_mm_range(struct mm_struct + bool freed_tables) + { + struct flush_tlb_info *info; ++ int cpu = get_cpu(); + u64 new_tlb_gen; +- int cpu; +- +- cpu = get_cpu(); +- +- /* Should we flush just the requested range? */ +- if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { +- start = 0; +- end = TLB_FLUSH_ALL; +- } + + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); +@@ -1060,22 +1060,19 @@ static void do_kernel_range_flush(void * + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) + { +- /* Balance as user space task's flush, a bit conservative */ +- if (end == TLB_FLUSH_ALL || +- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { +- on_each_cpu(do_flush_tlb_all, NULL, 1); +- } else { +- struct flush_tlb_info *info; ++ struct flush_tlb_info *info; + +- preempt_disable(); +- info = get_flush_tlb_info(NULL, start, end, 0, false, +- TLB_GENERATION_INVALID); ++ guard(preempt)(); ++ ++ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, ++ TLB_GENERATION_INVALID); + ++ if (info->end == TLB_FLUSH_ALL) ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++ else + on_each_cpu(do_kernel_range_flush, info, 1); + +- put_flush_tlb_info(); +- preempt_enable(); +- } ++ put_flush_tlb_info(); + } + + /* +@@ -1247,7 +1244,7 @@ void arch_tlbbatch_flush(struct arch_tlb + + int cpu = get_cpu(); + +- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, ++ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false, + TLB_GENERATION_INVALID); + /* + * flush_tlb_multi() is not optimized for the common case in which only diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch b/debian/patches/patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch new file mode 100644 index 0000000..683c3d1 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch @@ -0,0 +1,89 @@ +From a182b0471ba3c3329d93abfa07e3d452183a9137 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:23 -0500 +Subject: x86/mm: get INVLPGB count max from CPUID + +The CPU advertises the maximum number of pages that can be shot down +with one INVLPGB instruction in the CPUID data. + +Save that information for later use. 
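For illustration, a sketch of how the saved limit is meant to be consumed, assuming start/end delimit the range and using helpers that later patches in this series introduce (a variant of this loop appears in the kernel-range flush patch):

    unsigned long addr, nr;

    /* Chunk a large range into INVLPGB-sized pieces. */
    for (addr = start; addr < end; addr += nr << PAGE_SHIFT) {
            nr = min((end - addr) >> PAGE_SHIFT,
                     (unsigned long)invlpgb_count_max);
            invlpgb_flush_addr_nosync(addr, nr);
    }
    tlbsync();      /* wait for the queued invalidations to finish */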
+ +Signed-off-by: Rik van Riel +--- + arch/x86/Kconfig.cpu | 5 +++++ + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/tlbflush.h | 7 +++++++ + arch/x86/kernel/cpu/amd.c | 8 ++++++++ + 4 files changed, 21 insertions(+) + +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES + def_bool y + depends on IA32_FEAT_CTL + ++config X86_BROADCAST_TLB_FLUSH ++ def_bool y ++ depends on CPU_SUP_AMD && 64BIT ++ + menuconfig PROCESSOR_SELECT + bool "Supported processor vendors" if EXPERT + help +@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32 + config CPU_SUP_AMD + default y + bool "Support AMD processors" if PROCESSOR_SELECT ++ select X86_BROADCAST_TLB_FLUSH + help + This enables detection, tunings and quirks for AMD processors + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -335,6 +335,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void) + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; + ++/* How many pages can we invalidate with one INVLPGB. */ ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++extern u16 invlpgb_count_max; ++#else ++#define invlpgb_count_max 1 ++#endif ++ + extern void initialize_tlbstate_and_flush(void); + + /* +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -29,6 +29,8 @@ + + #include "cpu.h" + ++u16 invlpgb_count_max __ro_after_init; ++ + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) + { + u32 gprs[8] = { 0 }; +@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ if (boot_cpu_has(X86_FEATURE_INVLPGB)) { ++ cpuid(0x80000008, &eax, &ebx, &ecx, &edx); ++ invlpgb_count_max = (edx & 0xffff) + 1; ++ } + } + + static const struct cpu_dev amd_cpu_dev = { diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch b/debian/patches/patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch new file mode 100644 index 0000000..b67e883 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch @@ -0,0 +1,129 @@ +From cc3f8dd3033c79abd9f37a94efed74a535a703c9 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:24 -0500 +Subject: x86/mm: add INVLPGB support code + +Add invlpgb.h with the helper functions and definitions needed to use +broadcast TLB invalidation on AMD EPYC 3 and newer CPUs. 
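As a usage sketch (the PCID value and addr are made up; addr must be page aligned and the caller must not migrate between the two calls): flush one 4 KiB user mapping for PCID 5 on every CPU, then wait for completion:

    invlpgb_flush_user_nr_nosync(5, addr, 1, false);  /* queue the flush */
    tlbsync();                             /* wait on the issuing CPU */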
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 101 ++++++++++++++++++++++++++++++++ + arch/x86/include/asm/tlbflush.h | 1 + + 2 files changed, 102 insertions(+) + create mode 100644 arch/x86/include/asm/invlpgb.h + +--- /dev/null ++++ b/arch/x86/include/asm/invlpgb.h +@@ -0,0 +1,101 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_INVLPGB ++#define _ASM_X86_INVLPGB ++ ++#include ++#include ++#include ++ ++/* ++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. ++ * ++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can ++ * be done in a parallel fashion. ++ * ++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from ++ * this CPU have completed. ++ */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, ++ unsigned long addr, u16 extra_count, ++ bool pmd_stride, u8 flags) ++{ ++ u32 edx = (pcid << 16) | asid; ++ u32 ecx = (pmd_stride << 31) | extra_count; ++ u64 rax = addr | flags; ++ ++ /* The low bits in rax are for flags. Verify addr is clean. */ ++ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); ++ ++ /* INVLPGB; supported in binutils >= 2.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx)); ++} ++ ++/* Wait for INVLPGB originated by this CPU to complete. */ ++static inline void tlbsync(void) ++{ ++ cant_migrate(); ++ /* TLBSYNC: supported in binutils >= 0.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); ++} ++ ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first can be used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_VA BIT(0) ++#define INVLPGB_PCID BIT(1) ++#define INVLPGB_ASID BIT(2) ++#define INVLPGB_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FINAL_ONLY BIT(4) ++#define INVLPGB_INCLUDE_NESTED BIT(5) ++ ++/* Flush all mappings for a given pcid and addr, not including globals. */ ++static inline void invlpgb_flush_user(unsigned long pcid, ++ unsigned long addr) ++{ ++ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); ++ tlbsync(); ++} ++ ++static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, ++ unsigned long addr, ++ u16 nr, ++ bool pmd_stride) ++{ ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) ++{ ++ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); ++ tlbsync(); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) ++{ ++ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. 
*/ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, 0); ++ tlbsync(); ++} ++ ++#endif /* _ASM_X86_INVLPGB */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch b/debian/patches/patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch new file mode 100644 index 0000000..23ebdc3 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch @@ -0,0 +1,58 @@ +From 6b6686f0d7e228d0a2d8c166204adea5484c20d7 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:25 -0500 +Subject: x86/mm: use INVLPGB for kernel TLB flushes + +Use broadcast TLB invalidation for kernel addresses when available. + +Remove the need to send IPIs for kernel TLB flushes. + +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++- + 1 file changed, 27 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1048,6 +1048,30 @@ void flush_tlb_all(void) + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + ++static bool broadcast_kernel_range_flush(struct flush_tlb_info *info) ++{ ++ unsigned long addr; ++ unsigned long nr; ++ ++ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH)) ++ return false; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_all(); ++ return true; ++ } ++ ++ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { ++ nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max); ++ invlpgb_flush_addr_nosync(addr, nr); ++ } ++ tlbsync(); ++ return true; ++} ++ + static void do_kernel_range_flush(void *info) + { + struct flush_tlb_info *f = info; +@@ -1067,7 +1091,9 @@ void flush_tlb_kernel_range(unsigned lon + info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, + TLB_GENERATION_INVALID); + +- if (info->end == TLB_FLUSH_ALL) ++ if (broadcast_kernel_range_flush(info)) ++ ; /* Fall through. */ ++ else if (info->end == TLB_FLUSH_ALL) + on_each_cpu(do_flush_tlb_all, NULL, 1); + else + on_each_cpu(do_kernel_range_flush, info, 1); diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch b/debian/patches/patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch new file mode 100644 index 0000000..6b7a571 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch @@ -0,0 +1,44 @@ +From 6cffce503223f9076a5e16177905ba3ab6d9f7d8 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:26 -0500 +Subject: x86/mm: use INVLPGB in flush_tlb_all + +The flush_tlb_all() function is not used a whole lot, but we might +as well use broadcast TLB flushing there, too. 
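One detail worth spelling out (reasoning inferred from the cant_migrate() check in tlbsync()): the INVLPGB and the TLBSYNC that waits for it must be issued from the same CPU, which is why the helper below wraps the broadcast flush in a preemption guard:

    guard(preempt)();       /* keep INVLPGB and TLBSYNC on one CPU */
    invlpgb_flush_all();    /* issues INVLPGB, then TLBSYNC */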
+ +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1036,6 +1036,19 @@ void flush_tlb_mm_range(struct mm_struct + } + + ++static bool broadcast_flush_tlb_all(void) ++{ ++ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH)) ++ return false; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ guard(preempt)(); ++ invlpgb_flush_all(); ++ return true; ++} ++ + static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +@@ -1044,6 +1057,8 @@ static void do_flush_tlb_all(void *info) + + void flush_tlb_all(void) + { ++ if (broadcast_flush_tlb_all()) ++ return; + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + on_each_cpu(do_flush_tlb_all, NULL, 1); + } diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch b/debian/patches/patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch new file mode 100644 index 0000000..8191576 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch @@ -0,0 +1,29 @@ +From 3d23d79d14cdd3c68dc5bffbaf34a60eaca7fa40 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:27 -0500 +Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing + +In the page reclaim code, we only track the CPU(s) where the TLB needs +to be flushed, rather than all the individual mappings that may be getting +invalidated. + +Use broadcast TLB flushing when that is available. + +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1292,7 +1292,9 @@ void arch_tlbbatch_flush(struct arch_tlb + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ invlpgb_flush_all_nonglobals(); ++ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch b/debian/patches/patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch new file mode 100644 index 0000000..41247f5 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch @@ -0,0 +1,602 @@ +From 79c9df0c7637c8ba8a1833889a2ace355d56c96e Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:28 -0500 +Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded + processes + +Use broadcast TLB invalidation, using the INVPLGB instruction, on AMD EPYC 3 +and newer CPUs. + +In order to not exhaust PCID space, and keep TLB flushes local for single +threaded processes, we only hand out broadcast ASIDs to processes active on +3 or more CPUs, and gradually increase the threshold as broadcast ASID space +is depleted. 
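Condensed into a sketch, the policy described above looks roughly like this (the wrapper name is invented; the helpers are the ones added by this patch):

    static void maybe_use_broadcast_asid(struct mm_struct *mm)
    {
            if (!static_cpu_has(X86_FEATURE_INVLPGB))
                    return;
            if (mm_global_asid(mm))         /* already broadcast-flushed */
                    return;
            if (mm_active_cpus_exceeds(mm, 3))      /* busy on enough CPUs */
                    use_global_asid(mm);    /* hand out a global ASID */
    }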
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/mmu.h | 6 + + arch/x86/include/asm/mmu_context.h | 14 ++ + arch/x86/include/asm/tlbflush.h | 73 ++++++ + arch/x86/mm/tlb.c | 344 ++++++++++++++++++++++++++++- + 4 files changed, 425 insertions(+), 12 deletions(-) + +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -67,6 +67,12 @@ typedef struct { + u16 pkey_allocation_map; + s16 execute_only_pkey; + #endif ++ ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++ u16 global_asid; ++ bool asid_transition; ++#endif ++ + } mm_context_t; + + #define INIT_MM_CONTEXT(mm) \ +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++extern void destroy_context_free_global_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). +@@ -160,6 +162,14 @@ static inline int init_new_context(struc + mm->context.execute_only_pkey = -1; + } + #endif ++ ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ mm->context.global_asid = 0; ++ mm->context.asid_transition = false; ++ } ++#endif ++ + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -169,6 +179,10 @@ static inline int init_new_context(struc + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ destroy_context_free_global_asid(mm); ++#endif + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -238,6 +239,78 @@ void flush_tlb_one_kernel(unsigned long + void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++static inline bool is_dyn_asid(u16 asid) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return true; ++ ++ return asid < TLB_NR_DYN_ASIDS; ++} ++ ++static inline bool is_global_asid(u16 asid) ++{ ++ return !is_dyn_asid(asid); ++} ++ ++static inline bool in_asid_transition(const struct flush_tlb_info *info) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ return info->mm && READ_ONCE(info->mm->context.asid_transition); ++} ++ ++static inline u16 mm_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return 0; ++ ++ asid = smp_load_acquire(&mm->context.global_asid); ++ ++ /* mm->context.global_asid is either 0, or a global ASID */ ++ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); ++ ++ return asid; ++} ++#else ++static inline bool is_dyn_asid(u16 asid) ++{ ++ return true; ++} ++ ++static inline bool is_global_asid(u16 asid) ++{ ++ return false; ++} ++ ++static inline bool in_asid_transition(const struct flush_tlb_info *info) ++{ ++ return false; ++} ++ ++static inline u16 mm_global_asid(struct mm_struct *mm) ++{ ++ return 0; ++} ++ ++static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ return false; ++} ++ ++static inline void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ VM_WARN_ON_ONCE(1); ++} ++ ++static inline void consider_global_asid(struct mm_struct *mm) 
++{ ++} ++#endif ++ + #ifdef CONFIG_PARAVIRT + #include + #endif +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st + return; + } + ++ /* ++ * TLB consistency for global ASIDs is maintained with broadcast TLB ++ * flushing. The TLB is never outdated, and does not need flushing. ++ */ ++ if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) { ++ u16 global_asid = mm_global_asid(next); ++ ++ if (global_asid) { ++ *new_asid = global_asid; ++ *need_flush = false; ++ return; ++ } ++ } ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st + *need_flush = true; + } + ++#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH ++/* ++ * Logic for broadcast TLB invalidation. ++ */ ++static DEFINE_RAW_SPINLOCK(global_asid_lock); ++static u16 last_global_asid = MAX_ASID_AVAILABLE; ++static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 }; ++static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 }; ++static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++static void reset_global_asid_space(void) ++{ ++ lockdep_assert_held(&global_asid_lock); ++ ++ /* ++ * A global TLB flush guarantees that any stale entries from ++ * previously freed global ASIDs get flushed from the TLB ++ * everywhere, making these global ASIDs safe to reuse. ++ */ ++ invlpgb_flush_all_nonglobals(); ++ ++ /* ++ * Clear all the previously freed global ASIDs from the ++ * broadcast_asid_used bitmap, now that the global TLB flush ++ * has made them actually available for re-use. ++ */ ++ bitmap_andnot(global_asid_used, global_asid_used, ++ global_asid_freed, MAX_ASID_AVAILABLE); ++ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); ++ ++ /* ++ * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID ++ * assignments, for tasks doing IPI based TLB shootdowns. ++ * Restart the search from the start of the global ASID space. ++ */ ++ last_global_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 get_global_asid(void) ++{ ++ ++ u16 asid; ++ ++ lockdep_assert_held(&global_asid_lock); ++ ++ /* The previous allocated ASID is at the top of the address space. */ ++ if (last_global_asid >= MAX_ASID_AVAILABLE - 1) ++ reset_global_asid_space(); ++ ++ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); ++ ++ if (asid >= MAX_ASID_AVAILABLE) { ++ /* This should never happen. */ ++ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available); ++ return 0; ++ } ++ ++ /* Claim this global ASID. 
*/ ++ __set_bit(asid, global_asid_used); ++ last_global_asid = asid; ++ global_asid_available--; ++ return asid; ++} ++ ++/* ++ * Returns true if the mm is transitioning from a CPU-local ASID to a global ++ * (INVLPGB) ASID, or the other way around. ++ */ ++static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ u16 global_asid = mm_global_asid(next); ++ ++ if (global_asid && prev_asid != global_asid) ++ return true; ++ ++ if (!global_asid && is_global_asid(prev_asid)) ++ return true; ++ ++ return false; ++} ++ ++void destroy_context_free_global_asid(struct mm_struct *mm) ++{ ++ if (!mm->context.global_asid) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* The global ASID can be re-used only after flush at wrap-around. */ ++ __set_bit(mm->context.global_asid, global_asid_freed); ++ ++ mm->context.global_asid = 0; ++ global_asid_available++; ++} ++ ++/* ++ * Check whether a process is currently active on more than "threshold" CPUs. ++ * This is a cheap estimation on whether or not it may make sense to assign ++ * a global ASID to this process, and use broadcast TLB invalidation. ++ */ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ /* This quick check should eliminate most single threaded programs. */ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ /* Slower check to make sure. */ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. */ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a global ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm->context.global_asid) ++ return; ++ ++ /* The last global ASID was consumed while waiting for the lock. */ ++ if (!global_asid_available) { ++ VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); ++ return; ++ } ++ ++ asid = get_global_asid(); ++ if (!asid) ++ return; ++ ++ /* ++ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> ++ * finish_asid_transition() needs to observe asid_transition = true ++ * once it observes global_asid. ++ */ ++ mm->context.asid_transition = true; ++ smp_store_release(&mm->context.global_asid, asid); ++} ++ ++static bool meets_global_asid_threshold(struct mm_struct *mm) ++{ ++ if (!global_asid_available) ++ return false; ++ ++ /* ++ * Assign a global ASID if the process is active on ++ * 4 or more CPUs simultaneously. ++ */ ++ return mm_active_cpus_exceeds(mm, 3); ++} ++ ++static void consider_global_asid(struct mm_struct *mm) ++{ ++ if (!static_cpu_has(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ if (meets_global_asid_threshold(mm)) ++ use_global_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_global_asid(mm); ++ int cpu; ++ ++ if (!READ_ONCE(mm->context.asid_transition)) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* ++ * The remote CPU is context switching. 
Wait for that to ++ * finish, to catch the unlikely case of it switching to ++ * the target mm with an out of date ASID. ++ */ ++ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) ++ cpu_relax(); ++ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the global ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ * ++ * This can race with the CPU switching to another task; ++ * that results in a (harmless) extra IPI. ++ */ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the global ASID. */ ++ WRITE_ONCE(mm->context.asid_transition, false); ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long maxnr = invlpgb_count_max; ++ unsigned long asid = info->mm->context.global_asid; ++ unsigned long addr = info->start; ++ unsigned long nr; ++ ++ /* Flushing multiple pages at once is not supported with 1GB pages. */ ++ if (info->stride_shift > PMD_SHIFT) ++ maxnr = 1; ++ ++ /* ++ * TLB flushes with INVLPGB are kicked off asynchronously. ++ * The inc_mm_tlb_gen() guarantees page table updates are done ++ * before these TLB flushes happen. ++ */ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid_nosync(user_pcid(asid)); ++ } else do { ++ /* ++ * Calculate how many pages can be flushed at once; if the ++ * remainder of the range is less than one page, flush one. ++ */ ++ nr = min(maxnr, (info->end - addr) >> info->stride_shift); ++ nr = max(nr, 1); ++ ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); ++ ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ tlbsync(); ++} ++#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */ ++ + /* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. +@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != +- next->context.ctx_id); ++ VM_WARN_ON(is_dyn_asid(prev_asid) && ++ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ next->context.ctx_id); + + /* + * If this races with another thread that enables lam, 'new_lam' +@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* ++ * Check if the current mm is transitioning to a new ASID. ++ */ ++ if (needs_global_asid_reload(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this PCID up to date ++ * all the time. ++ */ ++ if (is_global_asid(prev_asid)) ++ return; ++ ++ /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. 
No TLB flush required. +@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct + cond_mitigation(tsk); + + /* ++ * Let nmi_uaccess_okay() and finish_asid_transition() ++ * know that we're changing CR3. ++ */ ++ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); ++ barrier(); ++ ++ /* + * Stop remote flushes for the previous mm. + * Skip kernel threads; we never send init_mm TLB flushing IPIs, + * but the bitmap manipulation can cause cache line contention. +@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); +- +- /* Let nmi_uaccess_okay() know that we're changing CR3. */ +- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); +- barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_WARN_ON_ONCE(is_global_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info) + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a global ASID */ ++ if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. */ ++ if (is_global_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -926,7 +1243,7 @@ STATIC_NOPV void native_flush_tlb_multi( + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || in_asid_transition(info)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, +@@ -1021,8 +1338,11 @@ void flush_tlb_mm_range(struct mm_struct + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (mm_global_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + flush_tlb_multi(mm_cpumask(mm), info); ++ consider_global_asid(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch b/debian/patches/patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch new file mode 100644 index 0000000..002a364 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch @@ -0,0 +1,135 @@ +From 647727eaa06fc61fbc55de4c09ab0c0fe7bc7263 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:29 -0500 +Subject: x86/mm: do targeted broadcast flushing from tlbbatch code + +Instead of doing a system-wide TLB flush from arch_tlbbatch_flush, +queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending. + +This also allows us to avoid adding the CPUs of processes using broadcast +flushing to the batch->cpumask, and will hopefully further reduce TLB +flushing from the reclaim and compaction paths. + +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/tlbbatch.h | 1 + + arch/x86/include/asm/tlbflush.h | 12 ++----- + arch/x86/mm/tlb.c | 57 +++++++++++++++++++++++++++++++-- + 3 files changed, 58 insertions(+), 12 deletions(-) + +--- a/arch/x86/include/asm/tlbbatch.h ++++ b/arch/x86/include/asm/tlbbatch.h +@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch { + * the PFNs being flushed.. + */ + struct cpumask cpumask; ++ bool used_invlpgb; + }; + + #endif /* _ARCH_X86_TLBBATCH_H */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -358,21 +358,15 @@ static inline u64 inc_mm_tlb_gen(struct + return atomic64_inc_return(&mm->context.tlb_gen); + } + +-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, +- struct mm_struct *mm, +- unsigned long uaddr) +-{ +- inc_mm_tlb_gen(mm); +- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); +- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +-} +- + static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) + { + flush_tlb_mm(mm); + } + + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); ++extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr); + + static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1612,9 +1612,7 @@ void arch_tlbbatch_flush(struct arch_tlb + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { +- invlpgb_flush_all_nonglobals(); +- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { ++ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); +@@ -1623,12 +1621,65 @@ void arch_tlbbatch_flush(struct arch_tlb + local_irq_enable(); + } + ++ /* ++ * If we issued (asynchronous) INVLPGB flushes, wait for them here. 
++ * The cpumask above contains only CPUs that were running tasks ++ * not using broadcast TLB flushing. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) { ++ tlbsync(); ++ migrate_enable(); ++ batch->used_invlpgb = false; ++ } ++ + cpumask_clear(&batch->cpumask); + + put_flush_tlb_info(); + put_cpu(); + } + ++void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr) ++{ ++ u16 asid = mm_global_asid(mm); ++ ++ if (asid) { ++ /* ++ * Queue up an asynchronous invalidation. The corresponding ++ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done ++ * on the same CPU. ++ */ ++ if (!batch->used_invlpgb) { ++ batch->used_invlpgb = true; ++ migrate_disable(); ++ } ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false); ++ ++ /* ++ * Some CPUs might still be using a local ASID for this ++ * process, and require IPIs, while others are using the ++ * global ASID. ++ * ++ * In this corner case we need to do both the broadcast ++ * TLB invalidation, and send IPIs. The IPIs will help ++ * stragglers transition to the broadcast ASID. ++ */ ++ if (READ_ONCE(mm->context.asid_transition)) ++ asid = 0; ++ } ++ ++ if (!asid) { ++ inc_mm_tlb_gen(mm); ++ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); ++ } ++ ++ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); ++} ++ + /* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch b/debian/patches/patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch new file mode 100644 index 0000000..cf2f47b --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch @@ -0,0 +1,79 @@ +From 0678da9f0870f0d211d49808a66e98abc0c58438 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:30 -0500 +Subject: x86/mm: enable AMD translation cache extensions + +With AMD TCE (translation cache extensions) only the intermediate mappings +that cover the address range zapped by INVLPG / INVLPGB get invalidated, +rather than all intermediate mappings getting zapped at every TLB invalidation. + +This can help reduce the TLB miss rate, by keeping more intermediate +mappings in the cache. + +From the AMD manual: + +Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit +to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on +TLB entries. When this bit is 0, these instructions remove the target PTE +from the TLB as well as all upper-level table entries that are cached +in the TLB, whether or not they are associated with the target PTE. +When this bit is set, these instructions will remove the target PTE and +only those upper-level entries that lead to the target PTE in +the page table hierarchy, leaving unrelated upper-level entries intact. 
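A small, hypothetical sanity check (not part of the patch) for whether TCE ended up enabled, using the EFER bit defined below:

    u64 efer;

    rdmsrl(MSR_EFER, efer);
    if (efer & EFER_TCE)
            pr_info("AMD translation cache extensions enabled\n");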
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/msr-index.h | 2 ++ + arch/x86/kernel/cpu/amd.c | 4 ++++ + tools/arch/x86/include/asm/msr-index.h | 2 ++ + 3 files changed, 8 insertions(+) + +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86 + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); ++ ++ /* Enable Translation Cache Extension */ ++ if (cpu_feature_enabled(X86_FEATURE_TCE)) ++ msr_set_bit(MSR_EFER, _EFER_TCE); + } + + #ifdef CONFIG_X86_32 +--- a/tools/arch/x86/include/asm/msr-index.h ++++ b/tools/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch b/debian/patches/patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch new file mode 100644 index 0000000..7fe2b64 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch @@ -0,0 +1,66 @@ +From 02d1759eda082f9595f3232f5dffd5d49943924a Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Wed, 22 Jan 2025 23:23:31 -0500 +Subject: x86/mm: only invalidate final translations with INVLPGB + +Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVPLGB. +This way only leaf mappings get removed from the TLB, leaving intermediate +translations cached. + +On the (rare) occasions where we free page tables we do a full flush, +ensuring intermediate translations get flushed from the TLB. 
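Put concretely, the flag selection this adds (mirroring the invlpgb.h hunk below): zapping PTEs inside a live VMA leaves the page tables in place, so only final translations need to go; freeing page tables forces the stronger flush:

    unsigned long flags = INVLPGB_PCID | INVLPGB_VA;

    /* Keep cached upper-level entries unless page tables were freed. */
    if (!freed_tables)
            flags |= INVLPGB_FINAL_ONLY;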
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 10 ++++++++-- + arch/x86/mm/tlb.c | 8 ++++---- + 2 files changed, 12 insertions(+), 6 deletions(-) + +--- a/arch/x86/include/asm/invlpgb.h ++++ b/arch/x86/include/asm/invlpgb.h +@@ -67,9 +67,15 @@ static inline void invlpgb_flush_user(un + static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + u16 nr, +- bool pmd_stride) ++ bool pmd_stride, ++ bool freed_tables) + { +- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++ unsigned long flags = INVLPGB_PCID | INVLPGB_VA; ++ ++ if (!freed_tables) ++ flags |= INVLPGB_FINAL_ONLY; ++ ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags); + } + + /* Flush all mappings for a given PCID, not including globals. */ +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -518,10 +518,10 @@ static void broadcast_tlb_flush(struct f + nr = min(maxnr, (info->end - addr) >> info->stride_shift); + nr = max(nr, 1); + +- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables); + /* Do any CPUs supporting INVLPGB need PTI? */ + if (static_cpu_has(X86_FEATURE_PTI)) +- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables); + + addr += nr << info->stride_shift; + } while (addr < info->end); +@@ -1654,10 +1654,10 @@ void arch_tlbbatch_add_pending(struct ar + batch->used_invlpgb = true; + migrate_disable(); + } +- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false); ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false); + /* Do any CPUs supporting INVLPGB need PTI? */ + if (static_cpu_has(X86_FEATURE_PTI)) +- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false); ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false); + + /* + * Some CPUs might still be using a local ASID for this diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch b/debian/patches/patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch new file mode 100644 index 0000000..f005552 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch @@ -0,0 +1,94 @@ +From b61dfc43cfc7511795366dfd9260f0959ca2f2d2 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Thu, 19 Dec 2024 15:32:53 -0500 +Subject: mm: remove unnecessary calls to lru_add_drain + +There seem to be several categories of calls to lru_add_drain +and lru_add_drain_all. + +The first are code paths that recently allocated, swapped in, +or otherwise processed a batch of pages, and want them all on +the LRU. These drain pages that were recently allocated, +probably on the local CPU. + +A second category are code paths that are actively trying to +reclaim, migrate, or offline memory. These often use lru_add_drain_all, +to drain the caches on all CPUs. + +However, there also seem to be some other callers where we +aren't really doing either. They are calling lru_add_drain(), +despite operating on pages that may have been allocated +long ago, and quite possibly on different CPUs. + +Those calls are not likely to be effective at anything but +creating lock contention on the LRU locks. + +Remove the lru_add_drain calls in the latter category. 
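For reference, the two draining primitives being distinguished above (illustrative; not part of the patch):

    lru_add_drain();        /* drain this CPU's pending folio batches only */
    lru_add_drain_all();    /* schedule drain work on every CPU; this is
                             * what reclaim/migration/offlining paths use */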
+ +Signed-off-by: Rik van Riel +Suggested-by: David Hildenbrand +Acked-by: Shakeel Butt +Acked-by: David Hildenbrand +--- + mm/memory.c | 1 - + mm/mmap.c | 2 -- + mm/swap_state.c | 1 - + mm/vma.c | 2 -- + 4 files changed, 6 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are + struct mmu_notifier_range range; + struct mmu_gather tlb; + +- lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + address, end); + hugetlb_zap_begin(vma, &range.start, &range.end); +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm) + goto destroy; + } + +- lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? but nobody should be looking */ +@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str + vma, new_start, length, false, true)) + return -ENOMEM; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + +- lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); +--- a/mm/vma.c ++++ b/mm/vma.c +@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas, + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, +@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct + * were isolated before we downgraded mmap_lock. + */ + mas_set(mas_detach, 1); +- lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch b/debian/patches/patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch new file mode 100644 index 0000000..5f745f0 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch @@ -0,0 +1,429 @@ +From e2d1ffb13e3909dab142f0f8ec8f934b79930717 Mon Sep 17 00:00:00 2001 +From: Vincenzo Frascino +Date: Mon, 14 Oct 2024 16:13:39 +0100 +Subject: vdso: Introduce vdso/page.h + +The VDSO implementation includes headers from outside of the +vdso/ namespace. + +Introduce vdso/page.h to make sure that the generic library +uses only the allowed namespace. 
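The header introduced below centralizes PAGE_SHIFT, PAGE_SIZE and PAGE_MASK, and both the hunks that follow and the follow-up patch 0015 further down revolve around a signedness subtlety in PAGE_MASK. A small standalone sketch of that subtlety, using fixed-width types to stand in for a 32-bit architecture with a 64-bit phys_addr_t:

/* Editor's sketch: why a signed PAGE_MASK matters on 32-bit with 64-bit
 * phys_addr_t.  uint32_t/int32_t stand in for the 32-bit arch's types. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t phys = 0x123456789000ULL;	/* physical address above 4 GiB */

	/* Signed variant: ~((1 << PAGE_SHIFT) - 1) is a negative int, so it
	 * sign-extends when widened and the high address bits survive. */
	int32_t signed_mask = ~((1 << PAGE_SHIFT) - 1);

	/* Unsigned variant: with a 32-bit unsigned long it zero-extends,
	 * silently truncating the address to its low 32 bits. */
	uint32_t unsigned_mask = ~(((uint32_t)1 << PAGE_SHIFT) - 1);

	printf("signed PAGE_MASK:   %#018" PRIx64 "\n", phys & (uint64_t)(int64_t)signed_mask);
	printf("unsigned PAGE_MASK: %#018" PRIx64 "\n", phys & (uint64_t)unsigned_mask);
	return 0;
}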
+ +Signed-off-by: Vincenzo Frascino +Signed-off-by: Thomas Gleixner +Reviewed-by: Arnd Bergmann +Acked-by: Geert Uytterhoeven # m68k +Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com +--- + arch/alpha/include/asm/page.h | 6 +----- + arch/arc/include/uapi/asm/page.h | 7 +++---- + arch/arm/include/asm/page.h | 5 +---- + arch/arm64/include/asm/page-def.h | 5 +---- + arch/csky/include/asm/page.h | 8 ++------ + arch/hexagon/include/asm/page.h | 4 +--- + arch/loongarch/include/asm/page.h | 7 +------ + arch/m68k/include/asm/page.h | 6 ++---- + arch/microblaze/include/asm/page.h | 5 +---- + arch/mips/include/asm/page.h | 7 +------ + arch/nios2/include/asm/page.h | 7 +------ + arch/openrisc/include/asm/page.h | 11 +---------- + arch/parisc/include/asm/page.h | 4 +--- + arch/powerpc/include/asm/page.h | 10 +--------- + arch/riscv/include/asm/page.h | 4 +--- + arch/s390/include/asm/page.h | 13 +++++-------- + arch/sh/include/asm/page.h | 6 ++---- + arch/sparc/include/asm/page_32.h | 4 +--- + arch/sparc/include/asm/page_64.h | 4 +--- + arch/um/include/asm/page.h | 5 +---- + arch/x86/include/asm/page_types.h | 5 +---- + arch/xtensa/include/asm/page.h | 8 +------- + include/vdso/page.h | 30 ++++++++++++++++++++++++++++++ + 23 files changed, 61 insertions(+), 110 deletions(-) + create mode 100644 include/vdso/page.h + +--- a/arch/alpha/include/asm/page.h ++++ b/arch/alpha/include/asm/page.h +@@ -4,11 +4,7 @@ + + #include + #include +- +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #ifndef __ASSEMBLY__ + +--- a/arch/arc/include/uapi/asm/page.h ++++ b/arch/arc/include/uapi/asm/page.h +@@ -14,7 +14,7 @@ + + /* PAGE_SHIFT determines the page size */ + #ifdef __KERNEL__ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT ++#include + #else + /* + * Default 8k +@@ -24,11 +24,10 @@ + * not available + */ + #define PAGE_SHIFT 13 ++#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */ ++#define PAGE_MASK (~(PAGE_SIZE-1)) + #endif + +-#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */ + #define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwrds */ + +-#define PAGE_MASK (~(PAGE_SIZE-1)) +- + #endif /* _UAPI__ASM_ARC_PAGE_H */ +--- a/arch/arm/include/asm/page.h ++++ b/arch/arm/include/asm/page.h +@@ -7,10 +7,7 @@ + #ifndef _ASMARM_PAGE_H + #define _ASMARM_PAGE_H + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) ++#include + + #ifndef __ASSEMBLY__ + +--- a/arch/arm64/include/asm/page-def.h ++++ b/arch/arm64/include/asm/page-def.h +@@ -10,9 +10,6 @@ + + #include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #endif /* __ASM_PAGE_DEF_H */ +--- a/arch/csky/include/asm/page.h ++++ b/arch/csky/include/asm/page.h +@@ -7,12 +7,8 @@ + #include + #include + +-/* +- * PAGE_SHIFT determines the page size: 4KB +- */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE - 1)) ++#include ++ + #define THREAD_SIZE (PAGE_SIZE * 2) + #define THREAD_MASK (~(THREAD_SIZE - 1)) + #define THREAD_SHIFT (PAGE_SHIFT + 1) +--- a/arch/hexagon/include/asm/page.h ++++ b/arch/hexagon/include/asm/page.h +@@ -45,9 +45,7 @@ + #define HVM_HUGEPAGE_SIZE 0x5 + 
#endif + +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (1UL << PAGE_SHIFT) +-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) ++#include + + #ifdef __KERNEL__ + #ifndef __ASSEMBLY__ +--- a/arch/loongarch/include/asm/page.h ++++ b/arch/loongarch/include/asm/page.h +@@ -8,12 +8,7 @@ + #include + #include + +-/* +- * PAGE_SHIFT determines the page size +- */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE - 1)) ++#include + + #define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) + #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) +--- a/arch/m68k/include/asm/page.h ++++ b/arch/m68k/include/asm/page.h +@@ -6,10 +6,8 @@ + #include + #include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include ++ + #define PAGE_OFFSET (PAGE_OFFSET_RAW) + + #ifndef __ASSEMBLY__ +--- a/arch/microblaze/include/asm/page.h ++++ b/arch/microblaze/include/asm/page.h +@@ -19,10 +19,7 @@ + + #ifdef __KERNEL__ + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR)) + +--- a/arch/mips/include/asm/page.h ++++ b/arch/mips/include/asm/page.h +@@ -14,12 +14,7 @@ + #include + #include + +-/* +- * PAGE_SHIFT determines the page size +- */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) ++#include + + /* + * This is used for calculating the real page sizes +--- a/arch/nios2/include/asm/page.h ++++ b/arch/nios2/include/asm/page.h +@@ -18,12 +18,7 @@ + #include + #include + +-/* +- * PAGE_SHIFT determines the page size +- */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE - 1)) ++#include + + /* + * PAGE_OFFSET -- the first address of the first page of memory. +--- a/arch/openrisc/include/asm/page.h ++++ b/arch/openrisc/include/asm/page.h +@@ -15,16 +15,7 @@ + #ifndef __ASM_OPENRISC_PAGE_H + #define __ASM_OPENRISC_PAGE_H + +- +-/* PAGE_SHIFT determines the page size */ +- +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#ifdef __ASSEMBLY__ +-#define PAGE_SIZE (1 << PAGE_SHIFT) +-#else +-#define PAGE_SIZE (1UL << PAGE_SHIFT) +-#endif +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #define PAGE_OFFSET 0xc0000000 + #define KERNELBASE PAGE_OFFSET +--- a/arch/parisc/include/asm/page.h ++++ b/arch/parisc/include/asm/page.h +@@ -4,9 +4,7 @@ + + #include + +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + +--- a/arch/powerpc/include/asm/page.h ++++ b/arch/powerpc/include/asm/page.h +@@ -21,8 +21,7 @@ + * page size. When using 64K pages however, whether we are really supporting + * 64K pages in HW or not is irrelevant to those definitions. + */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) ++#include + + #ifndef __ASSEMBLY__ + #ifndef CONFIG_HUGETLB_PAGE +@@ -42,13 +41,6 @@ extern unsigned int hpage_shift; + #endif + + /* +- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we +- * assign PAGE_MASK to a larger type it gets extended the way we want +- * (i.e. 
with 1s in the high bits) +- */ +-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1)) +- +-/* + * KERNELBASE is the virtual address of the start of the kernel, it's often + * the same as PAGE_OFFSET, but _might not be_. + * +--- a/arch/riscv/include/asm/page.h ++++ b/arch/riscv/include/asm/page.h +@@ -12,9 +12,7 @@ + #include + #include + +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE - 1)) ++#include + + #define HPAGE_SHIFT PMD_SHIFT + #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) +--- a/arch/s390/include/asm/page.h ++++ b/arch/s390/include/asm/page.h +@@ -11,14 +11,11 @@ + #include + #include + +-#define _PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +-#define _PAGE_MASK (~(_PAGE_SIZE - 1)) ++#include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT _PAGE_SHIFT +-#define PAGE_SIZE _PAGE_SIZE +-#define PAGE_MASK _PAGE_MASK ++#define _PAGE_SHIFT PAGE_SHIFT ++#define _PAGE_SIZE PAGE_SIZE ++#define _PAGE_MASK PAGE_MASK + #define PAGE_DEFAULT_ACC _AC(0, UL) + /* storage-protection override */ + #define PAGE_SPO_ACC 9 +--- a/arch/sh/include/asm/page.h ++++ b/arch/sh/include/asm/page.h +@@ -8,10 +8,8 @@ + + #include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include ++ + #define PTE_MASK PAGE_MASK + + #if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) +--- a/arch/sparc/include/asm/page_32.h ++++ b/arch/sparc/include/asm/page_32.h +@@ -11,9 +11,7 @@ + + #include + +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #ifndef __ASSEMBLY__ + +--- a/arch/sparc/include/asm/page_64.h ++++ b/arch/sparc/include/asm/page_64.h +@@ -4,9 +4,7 @@ + + #include + +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + /* Flushing for D-cache alias handling is only needed if + * the page size is smaller than 16K. +--- a/arch/um/include/asm/page.h ++++ b/arch/um/include/asm/page.h +@@ -9,10 +9,7 @@ + + #include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #ifndef __ASSEMBLY__ + +--- a/arch/x86/include/asm/page_types.h ++++ b/arch/x86/include/asm/page_types.h +@@ -6,10 +6,7 @@ + #include + #include + +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) + +--- a/arch/xtensa/include/asm/page.h ++++ b/arch/xtensa/include/asm/page.h +@@ -18,13 +18,7 @@ + #include + #include + +-/* +- * PAGE_SHIFT determines the page size +- */ +- +-#define PAGE_SHIFT CONFIG_PAGE_SHIFT +-#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #ifdef CONFIG_MMU + #define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR +--- /dev/null ++++ b/include/vdso/page.h +@@ -0,0 +1,30 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __VDSO_PAGE_H ++#define __VDSO_PAGE_H ++ ++#include ++ ++/* ++ * PAGE_SHIFT determines the page size. ++ * ++ * Note: This definition is required because PAGE_SHIFT is used ++ * in several places throuout the codebase. 
++ */ ++#define PAGE_SHIFT CONFIG_PAGE_SHIFT ++ ++#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT) ++ ++#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT) ++/* ++ * Applies only to 32-bit architectures with a 64-bit phys_addr_t. ++ * ++ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long. ++ * So if we assign PAGE_MASK to a larger type it gets extended the ++ * way we want (i.e. with 1s in the high bits) ++ */ ++#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1)) ++#else ++#define PAGE_MASK (~(PAGE_SIZE - 1)) ++#endif ++ ++#endif /* __VDSO_PAGE_H */ diff --git a/debian/patches/patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch b/debian/patches/patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch new file mode 100644 index 0000000..c457087 --- /dev/null +++ b/debian/patches/patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch @@ -0,0 +1,68 @@ +From 4478ee194402472199e05d3e27a87f0fc775cc18 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Thu, 24 Oct 2024 13:34:26 +0000 +Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures + +With the introduction of an architecture-independent defintion of +PAGE_MASK, we had to make a choice between defining it as 'unsigned long' +as on 64-bit architectures, or as signed 'long' as required for +architectures with a 64-bit phys_addr_t. + +To reduce the risk for regressions and minimize the changes in behavior, +the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT +is set, but that ended up causing a regression after all in the +early_init_dt_add_memory_arch() function that uses 64-bit integers for +address calculation. + +Presumably the same regression also affects mips32 and powerpc32 when +dealing with large amounts of memory on DT platforms: like arm32, they were +using the signed version unconditionally. + +The two most sensible options for addressing the regression are either to +go back to an architecture specific definition, using a signed constant on +arm/powerpc/mips and unsigned on the others, or to use the same definition +everywhere. + +Use the simpler of those two and change them all to the signed version, in +the hope that this does not cause a different type of bug. Most of the +other 32-bit architectures have no large physical address support and are +rarely used, so it seems more likely that using the same definition helps +than hurts here. + +In particular, x86-32 does have physical addressing extensions, so it +already changed to the signed version after the previous patch, so it makes +sense to use the same version on non-PAE as well. + +Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h") +Reported-by: Naresh Kamboju +Signed-off-by: Arnd Bergmann +Signed-off-by: Thomas Gleixner +Tested-by: Anders Roxell +Tested-by: Vincenzo Frascino +Reviewed-by: Vincenzo Frascino +Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org +Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/ +--- + include/vdso/page.h | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/include/vdso/page.h ++++ b/include/vdso/page.h +@@ -14,13 +14,14 @@ + + #define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT) + +-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT) ++#if !defined(CONFIG_64BIT) + /* +- * Applies only to 32-bit architectures with a 64-bit phys_addr_t. ++ * Applies only to 32-bit architectures. 
+ * + * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long. + * So if we assign PAGE_MASK to a larger type it gets extended the +- * way we want (i.e. with 1s in the high bits) ++ * way we want (i.e. with 1s in the high bits) while masking a ++ * 64-bit value such as phys_addr_t. + */ + #define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1)) + #else diff --git a/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch b/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch index f19e80f..4ed1c7d 100644 --- a/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch +++ b/debian/patches/patchset-zen/sauce/0010-ZEN-mm-Stop-kswapd-early-when-nothing-s-waiting-for-.patch @@ -102,7 +102,7 @@ Contains: --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -6353,7 +6353,7 @@ retry: +@@ -6384,7 +6384,7 @@ retry: return 0; } @@ -111,7 +111,7 @@ Contains: { struct zone *zone; unsigned long pfmemalloc_reserve = 0; -@@ -6382,6 +6382,10 @@ static bool allow_direct_reclaim(pg_data +@@ -6413,6 +6413,10 @@ static bool allow_direct_reclaim(pg_data wmark_ok = free_pages > pfmemalloc_reserve / 2; @@ -122,7 +122,7 @@ Contains: /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) -@@ -6447,7 +6451,7 @@ static bool throttle_direct_reclaim(gfp_ +@@ -6478,7 +6482,7 @@ static bool throttle_direct_reclaim(gfp_ /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; @@ -131,7 +131,7 @@ Contains: goto out; break; } -@@ -6469,11 +6473,14 @@ static bool throttle_direct_reclaim(gfp_ +@@ -6500,11 +6504,14 @@ static bool throttle_direct_reclaim(gfp_ */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, @@ -148,7 +148,7 @@ Contains: if (fatal_signal_pending(current)) return true; -@@ -6976,14 +6983,14 @@ restart: +@@ -7007,14 +7014,14 @@ restart: * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && diff --git a/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch b/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch new file mode 100644 index 0000000..69708ac --- /dev/null +++ b/debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch @@ -0,0 +1,194 @@ +From eacae6d88bcc8a925124f97b7788bb2bfac8b267 Mon Sep 17 00:00:00 2001 +From: Vinay Banakar +Date: Mon, 20 Jan 2025 16:47:29 -0600 +Subject: mm: Optimize TLB flushes during page reclaim + +The current implementation in shrink_folio_list() performs full TLB +flushes and issues IPIs for each individual page being reclaimed. This +causes unnecessary overhead during memory reclaim, whether triggered +by madvise(MADV_PAGEOUT) or kswapd, especially in scenarios where +applications are actively moving cold pages to swap while maintaining +high performance requirements for hot pages. + +The current code: +1. Clears PTE and unmaps each page individually +2. Performs a full TLB flush on all cores using the VMA (via CR3 write) or +issues individual TLB shootdowns (invlpg+invlpcid) for single-core usage +3. 
Submits each page individually to BIO + +This approach results in: +- Excessive full TLB flushes across all cores +- Unnecessary IPI storms when processing multiple pages +- Suboptimal I/O submission patterns + +I initially tried using selective TLB shootdowns (invlpg) instead of +full TLB flushes per each page to avoid interference with other +threads. However, this approach still required sending IPIs to all +cores for each page, which did not significantly improve application +throughput. + +This patch instead optimizes the process by batching operations, +issuing one IPI per PMD instead of per page. This reduces interrupts +by a factor of 512 and enables batching page submissions to BIO. The +new approach: +1. Collect dirty pages that need to be written back +2. Issue a single TLB flush for all dirty pages in the batch +3. Process the collected pages for writebacks (submit to BIO) + +Testing shows significant reduction in application throughput impact +during page-out operations. Applications maintain better performance +during memory reclaim, when triggered by explicit +madvise(MADV_PAGEOUT) calls. + +I'd appreciate your feedback on this approach, especially on the +correctness of batched BIO submissions. Looking forward to your +comments. + +Signed-off-by: Vinay Banakar +--- + mm/vmscan.c | 120 ++++++++++++++++++++++++++++++++-------------------- + 1 file changed, 74 insertions(+), 46 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1053,6 +1053,7 @@ static unsigned int shrink_folio_list(st + struct folio_batch free_folios; + LIST_HEAD(ret_folios); + LIST_HEAD(demote_folios); ++ LIST_HEAD(pageout_list); + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; + bool do_demote_pass; +@@ -1365,52 +1366,9 @@ retry: + if (!sc->may_writepage) + goto keep_locked; + +- /* +- * Folio is dirty. Flush the TLB if a writable entry +- * potentially exists to avoid CPU writes after I/O +- * starts and then write it out here. +- */ +- try_to_unmap_flush_dirty(); +- switch (pageout(folio, mapping, &plug, folio_list)) { +- case PAGE_KEEP: +- goto keep_locked; +- case PAGE_ACTIVATE: +- /* +- * If shmem folio is split when writeback to swap, +- * the tail pages will make their own pass through +- * this function and be accounted then. +- */ +- if (nr_pages > 1 && !folio_test_large(folio)) { +- sc->nr_scanned -= (nr_pages - 1); +- nr_pages = 1; +- } +- goto activate_locked; +- case PAGE_SUCCESS: +- if (nr_pages > 1 && !folio_test_large(folio)) { +- sc->nr_scanned -= (nr_pages - 1); +- nr_pages = 1; +- } +- stat->nr_pageout += nr_pages; +- +- if (folio_test_writeback(folio)) +- goto keep; +- if (folio_test_dirty(folio)) +- goto keep; +- +- /* +- * A synchronous write - probably a ramdisk. Go +- * ahead and try to reclaim the folio. +- */ +- if (!folio_trylock(folio)) +- goto keep; +- if (folio_test_dirty(folio) || +- folio_test_writeback(folio)) +- goto keep_locked; +- mapping = folio_mapping(folio); +- fallthrough; +- case PAGE_CLEAN: +- ; /* try to free the folio below */ +- } ++ /* Add to pageout list for defered bio submissions */ ++ list_add(&folio->lru, &pageout_list); ++ continue; + } + + /* +@@ -1521,6 +1479,76 @@ keep: + } + /* 'folio_list' is always empty here */ + ++ if (!list_empty(&pageout_list)) { ++ /* ++ * Batch TLB flushes by flushing once before processing all dirty pages. ++ * Since we operate on one PMD at a time, this batches TLB flushes at ++ * PMD granularity rather than per-page, reducing IPIs. 
++ */ ++ struct address_space *mapping; ++ try_to_unmap_flush_dirty(); ++ ++ while (!list_empty(&pageout_list)) { ++ struct folio *folio = lru_to_folio(&pageout_list); ++ list_del(&folio->lru); ++ ++ /* Recheck if page got reactivated */ ++ if (folio_test_active(folio) || ++ (folio_mapped(folio) && folio_test_young(folio))) ++ goto skip_pageout_locked; ++ ++ mapping = folio_mapping(folio); ++ pageout_t pageout_res = pageout(folio, mapping, &plug); ++ switch (pageout_res) { ++ case PAGE_KEEP: ++ goto skip_pageout_locked; ++ case PAGE_ACTIVATE: ++ goto skip_pageout_locked; ++ case PAGE_SUCCESS: ++ stat->nr_pageout += folio_nr_pages(folio); ++ ++ if (folio_test_writeback(folio) || ++ folio_test_dirty(folio)) ++ goto skip_pageout; ++ ++ /* ++ * A synchronous write - probably a ramdisk. Go ++ * ahead and try to reclaim the folio. ++ */ ++ if (!folio_trylock(folio)) ++ goto skip_pageout; ++ if (folio_test_dirty(folio) || ++ folio_test_writeback(folio)) ++ goto skip_pageout_locked; ++ ++ // Try to free the page ++ if (!mapping || ++ !__remove_mapping(mapping, folio, true, ++ sc->target_mem_cgroup)) ++ goto skip_pageout_locked; ++ ++ nr_reclaimed += folio_nr_pages(folio); ++ folio_unlock(folio); ++ continue; ++ ++ case PAGE_CLEAN: ++ if (!mapping || ++ !__remove_mapping(mapping, folio, true, ++ sc->target_mem_cgroup)) ++ goto skip_pageout_locked; ++ ++ nr_reclaimed += folio_nr_pages(folio); ++ folio_unlock(folio); ++ continue; ++ } ++ ++skip_pageout_locked: ++ folio_unlock(folio); ++skip_pageout: ++ list_add(&folio->lru, &ret_folios); ++ } ++ } ++ + /* Migrate folios selected for demotion */ + stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += stat->nr_demoted; diff --git a/debian/patches/series b/debian/patches/series index 0a51c7d..2bc6155 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -47,7 +47,6 @@ features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch # Disable autoloading/probing of various drivers by default -debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch debian/snd-pcsp-disable-autoload.patch bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch debian/fjes-disable-autoload.patch @@ -203,6 +202,24 @@ patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch +patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch +patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch +patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch +patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch +patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch +patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch +patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch +patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch +patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch +patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch +patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch +patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch 
+patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch +patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch +patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch + +patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch + patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch