release 6.12.13

2025-02-09 19:40:34 +03:00
parent b8c80400f5
commit e0baaf49dd
26 changed files with 385 additions and 221 deletions


@@ -0,0 +1,124 @@
From e11153c4df0fee7caadec3714a60a4936d6a9ea2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:20 -0500
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional
Currently x86 uses CONFIG_MMU_GATHER_RCU_TABLE_FREE when using
paravirt, and not when running on bare metal.
There is no real good reason to do things differently for
each setup. Make them all the same.
Currently get_user_pages_fast synchronizes against page table
freeing in two different ways:
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
This is done because some paravirt TLB flush implementations
handle the TLB flush in the hypervisor, and will do the flush
even when the target CPU has interrupts disabled.
Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
Using RCU synchronization between page table freeing and get_user_pages_fast()
allows bare metal to also do TLB flushing while interrupts are disabled.
Various places in the mm do still block IRQs or disable preemption
as an implicit way to block RCU frees.
That makes it safe to use INVLPGB on AMD CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/Kconfig | 2 +-
arch/x86/kernel/paravirt.c | 7 +------
arch/x86/mm/pgtable.c | 16 ++++------------
3 files changed, 6 insertions(+), 19 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,7 +270,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
+ .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-#endif
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
{
pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, pte);
+ tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
tlb->need_flush_all = 1;
#endif
pagetable_pmd_dtor(ptdesc);
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+ tlb_remove_table(tlb, ptdesc_page(ptdesc));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
pagetable_pud_dtor(ptdesc);
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */


@@ -0,0 +1,85 @@
From e8008cb69c5e4efbaedd70b0fb692343e4aa0e51 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:21 -0500
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/hyperv/mmu.c | 1 -
arch/x86/include/asm/paravirt.h | 5 -----
arch/x86/include/asm/paravirt_types.h | 2 --
arch/x86/kernel/kvm.c | 1 -
arch/x86/kernel/paravirt.c | 1 -
arch/x86/xen/mmu_pv.c | 1 -
6 files changed, 11 deletions(-)
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,


@@ -0,0 +1,112 @@
From 7ac6508c4db81eced5f6e3d7c8913af1da6cf110 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:22 -0500
Subject: x86/mm: consolidate full flush threshold decision
Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
---
arch/x86/mm/tlb.c | 56 ++++++++++++++++++++++++++---------------------
1 file changed, 31 insertions(+), 25 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -973,14 +973,32 @@ static struct flush_tlb_info *get_flush_
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
- info->start = start;
- info->end = end;
+ /*
+ * Round the start and end addresses to the page size specified
+ * by the stride shift. This ensures partial pages at the end of
+ * a range get fully invalidated.
+ */
+ info->start = round_down(start, 1 << stride_shift);
+ info->end = round_up(end, 1 << stride_shift);
info->mm = mm;
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ WARN_ONCE(start != info->start || end != info->end,
+ "TLB flush not stride %x aligned. Start %lx, end %lx\n",
+ 1 << stride_shift, start, end);
+
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ info->start = 0;
+ info->end = TLB_FLUSH_ALL;
+ }
+
return info;
}
@@ -998,17 +1016,8 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1060,22 +1069,19 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
+ struct flush_tlb_info *info;
+
+ guard(preempt)();
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+ if (info->end == TLB_FLUSH_ALL)
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
- put_flush_tlb_info();
- preempt_enable();
- }
+ put_flush_tlb_info();
}
/*
@@ -1247,7 +1253,7 @@ void arch_tlbbatch_flush(struct arch_tlb
int cpu = get_cpu();
- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
/*
* flush_tlb_multi() is not optimized for the common case in which only
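
The rounding and the full-flush decision that now live in get_flush_tlb_info()
are plain arithmetic, so they can be sanity-checked in isolation. A stand-alone
sketch in user-space C (tlb_single_page_flush_ceiling is assumed to be 33, the
usual x86 default; the addresses are arbitrary example values):

#include <stdio.h>

#define TLB_FLUSH_ALL	(~0UL)

/* Assumed default; in the kernel this is a tunable. */
static unsigned long tlb_single_page_flush_ceiling = 33;

static unsigned long round_down(unsigned long x, unsigned long a) { return x & ~(a - 1); }
static unsigned long round_up(unsigned long x, unsigned long a)   { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	unsigned long start = 0x7f1234567123UL, end = 0x7f12345a0456UL;
	unsigned int stride_shift = 12;			/* 4k stride */
	unsigned long stride = 1UL << stride_shift;

	/* Same rounding as the code above; partial pages get fully covered. */
	unsigned long info_start = round_down(start, stride);
	unsigned long info_end   = round_up(end, stride);

	/* If flushing page by page would be more expensive, do a full flush. */
	if ((info_end - info_start) >> stride_shift > tlb_single_page_flush_ceiling) {
		info_start = 0;
		info_end = TLB_FLUSH_ALL;
	}

	printf("flush start %#lx, end %#lx\n", info_start, info_end);
	return 0;
}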


@@ -0,0 +1,90 @@
From e772b2eb66e5c3cf668feadab678f2a88d896189 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:23 -0500
Subject: x86/mm: get INVLPGB count max from CPUID
The CPU advertises the maximum number of pages that can be shot down
with one INVLPGB instruction in the CPUID data.
Save that information for later use.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/Kconfig.cpu | 5 +++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/tlbflush.h | 7 +++++++
arch/x86/kernel/cpu/amd.c | 8 ++++++++
4 files changed, 21 insertions(+)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES
def_bool y
depends on IA32_FEAT_CTL
+config X86_BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
menuconfig PROCESSOR_SELECT
bool "Supported processor vendors" if EXPERT
help
@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32
config CPU_SUP_AMD
default y
bool "Support AMD processors" if PROCESSOR_SELECT
+ select X86_BROADCAST_TLB_FLUSH
help
This enables detection, tunings and quirks for AMD processors
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -335,6 +335,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can we invalidate with one INVLPGB. */
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+extern u16 invlpgb_count_max;
+#else
+#define invlpgb_count_max 1
+#endif
+
extern void initialize_tlbstate_and_flush(void);
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (boot_cpu_has(X86_FEATURE_INVLPGB)) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ invlpgb_count_max = (edx & 0xffff) + 1;
+ }
}
static const struct cpu_dev amd_cpu_dev = {
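
The CPUID decode added to cpu_detect_tlb_amd() can be reproduced from user
space; a small sketch using GCC's <cpuid.h> (leaf 0x80000008: EBX bit 3 is the
INVLPGB feature flag, EDX bits 15:0 hold the maximum page count minus one):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 1;			/* leaf not available */

	if (!(ebx & (1u << 3))) {
		printf("INVLPGB not supported\n");
		return 0;
	}

	/* Max number of pages INVLPGB can invalidate in one shot. */
	printf("invlpgb_count_max = %u\n", (edx & 0xffff) + 1);
	return 0;
}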


@@ -0,0 +1,130 @@
From 7a896b12875e2b988acbf0229fb4bcf9157b83bd Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:24 -0500
Subject: x86/mm: add INVLPGB support code
Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/invlpgb.h | 101 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/tlbflush.h | 1 +
2 files changed, 102 insertions(+)
create mode 100644 arch/x86/include/asm/invlpgb.h
--- /dev/null
+++ b/arch/x86/include/asm/invlpgb.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVLPGB
+#define _ASM_X86_INVLPGB
+
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 extra_count,
+ bool pmd_stride, u8 flags)
+{
+ u32 edx = (pcid << 16) | asid;
+ u32 ecx = (pmd_stride << 31) | extra_count;
+ u64 rax = addr | flags;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx));
+}
+
+/* Wait for INVLPGB originated by this CPU to complete. */
+static inline void tlbsync(void)
+{
+ cant_migrate();
+ /* TLBSYNC: supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first can be used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_VA BIT(0)
+#define INVLPGB_PCID BIT(1)
+#define INVLPGB_ASID BIT(2)
+#define INVLPGB_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FINAL_ONLY BIT(4)
+#define INVLPGB_INCLUDE_NESTED BIT(5)
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invlpgb_flush_user(unsigned long pcid,
+ unsigned long addr)
+{
+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
+ tlbsync();
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
+{
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
+ tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, 0);
+ tlbsync();
+}
+
+#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invlpgb.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
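
To make the register packing in __invlpgb() concrete, a stand-alone sketch
(user-space C; the PCID and address are arbitrary example values) computes the
three registers the instruction consumes for a single-page, 4k-stride flush:

#include <stdio.h>
#include <stdint.h>

#define INVLPGB_VA	(1u << 0)
#define INVLPGB_PCID	(1u << 1)

int main(void)
{
	unsigned long pcid = 5, asid = 0, addr = 0x7f0000000000UL;
	uint16_t extra_count = 0;		/* 0 extra pages: flush one page */
	int pmd_stride = 0;			/* 4k stride */
	uint8_t flags = INVLPGB_PCID | INVLPGB_VA;

	/* Same packing as __invlpgb() above. */
	uint32_t edx = (pcid << 16) | asid;
	uint32_t ecx = ((uint32_t)pmd_stride << 31) | extra_count;
	uint64_t rax = addr | flags;

	printf("rax=%#llx ecx=%#x edx=%#x\n",
	       (unsigned long long)rax, ecx, edx);
	return 0;
}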


@@ -0,0 +1,59 @@
From 99f2b0eda74d7ec76c9c48b78f9d30d251501c28 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:25 -0500
Subject: x86/mm: use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1057,6 +1057,30 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+static bool broadcast_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr;
+ unsigned long nr;
+
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_all();
+ return true;
+ }
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max);
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ tlbsync();
+ return true;
+}
+
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
@@ -1076,7 +1100,9 @@ void flush_tlb_kernel_range(unsigned lon
info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
- if (info->end == TLB_FLUSH_ALL)
+ if (broadcast_kernel_range_flush(info))
+ ; /* Fall through. */
+ else if (info->end == TLB_FLUSH_ALL)
on_each_cpu(do_flush_tlb_all, NULL, 1);
else
on_each_cpu(do_kernel_range_flush, info, 1);
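
The only non-obvious part of broadcast_kernel_range_flush() is the chunking by
invlpgb_count_max; a stand-alone model of that loop (user-space C, with an
assumed invlpgb_count_max of 8 and the actual instructions replaced by printf):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long start = 0xffffc90000000000UL;
	unsigned long end   = start + 20 * 4096;	/* 20 pages */
	unsigned long invlpgb_count_max = 8;		/* example value from CPUID */
	unsigned long addr, nr;

	for (addr = start; addr < end; addr += nr << PAGE_SHIFT) {
		nr = (end - addr) >> PAGE_SHIFT;
		if (nr > invlpgb_count_max)
			nr = invlpgb_count_max;
		printf("INVLPGB %#lx, %lu pages (asynchronous)\n", addr, nr);
	}
	printf("TLBSYNC (wait for all of the above)\n");
	return 0;
}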


@@ -0,0 +1,45 @@
From 1ef7edb5b2375d4010ed2ad0c7d87fcfa7ab4519 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:26 -0500
Subject: x86/mm: use INVLPGB in flush_tlb_all
The flush_tlb_all() function is not used a whole lot, but we might
as well use broadcast TLB flushing there, too.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/mm/tlb.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1045,6 +1045,19 @@ void flush_tlb_mm_range(struct mm_struct
}
+static bool broadcast_flush_tlb_all(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ guard(preempt)();
+ invlpgb_flush_all();
+ return true;
+}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1053,6 +1066,8 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
+ if (broadcast_flush_tlb_all())
+ return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}


@@ -0,0 +1,30 @@
From 5e5219596683c3b8178e09f6ec1e75154537325f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:27 -0500
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing
In the page reclaim code, we only track the CPU(s) where the TLB needs
to be flushed, rather than all the individual mappings that may be getting
invalidated.
Use broadcast TLB flushing when that is available.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/mm/tlb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1301,7 +1301,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ invlpgb_flush_all_nonglobals();
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();


@@ -0,0 +1,602 @@
From c7212dc64d8e9e4f12f1c6edea3b75c350a30381 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:28 -0500
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
processes
Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
and newer CPUs.
In order to not exhaust PCID space, and to keep TLB flushes local for
single-threaded processes, we only hand out broadcast ASIDs to processes
active on 4 or more CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/mmu.h | 6 +
arch/x86/include/asm/mmu_context.h | 14 ++
arch/x86/include/asm/tlbflush.h | 73 ++++++
arch/x86/mm/tlb.c | 344 ++++++++++++++++++++++++++++-
4 files changed, 425 insertions(+), 12 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -67,6 +67,12 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ u16 global_asid;
+ bool asid_transition;
+#endif
+
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void destroy_context_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
@@ -160,6 +162,14 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+#endif
+
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -169,6 +179,10 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ destroy_context_free_global_asid(mm);
+#endif
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -238,6 +239,78 @@ void flush_tlb_one_kernel(unsigned long
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+static inline bool is_dyn_asid(u16 asid)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return true;
+
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
+static inline bool in_asid_transition(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+#else
+static inline bool is_dyn_asid(u16 asid)
+{
+ return true;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return false;
+}
+
+static inline bool in_asid_transition(struct mm_struct *mm)
+{
+ return false;
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ return false;
+}
+
+static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ VM_WARN_ON_ONCE(1);
+}
+
+static inline void consider_global_asid(struct mm_struct *mm)
+{
+}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with broadcast TLB
+ * flushing. The TLB is never outdated, and does not need flushing.
+ */
+ if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st
*need_flush = true;
}
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+/*
+ * Logic for broadcast TLB invalidation.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 };
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 };
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ /*
+ * A global TLB flush guarantees that any stale entries from
+ * previously freed global ASIDs get flushed from the TLB
+ * everywhere, making these global ASIDs safe to reuse.
+ */
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * Clear all the previously freed global ASIDs from the
+ * global_asid_used bitmap, now that the global TLB flush
+ * has made them actually available for re-use.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /*
+ * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID
+ * assignments, for tasks doing IPI based TLB shootdowns.
+ * Restart the search from the start of the global ASID space.
+ */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 get_global_asid(void)
+{
+
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previously allocated ASID is at the top of the ASID space. */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Returns true if the mm is transitioning from a CPU-local ASID to a global
+ * (INVLPGB) ASID, or the other way around.
+ */
+static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid && prev_asid != global_asid)
+ return true;
+
+ if (!global_asid && is_global_asid(prev_asid))
+ return true;
+
+ return false;
+}
+
+void destroy_context_free_global_asid(struct mm_struct *mm)
+{
+ if (!mm->context.global_asid)
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+}
+
+/*
+ * Check whether a process is currently active on more than "threshold" CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm->context.global_asid)
+ return;
+
+ /* The last global ASID was consumed while waiting for the lock. */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = get_global_asid();
+ if (!asid)
+ return;
+
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+
+static bool meets_global_asid_threshold(struct mm_struct *mm)
+{
+ if (!global_asid_available)
+ return false;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ return mm_active_cpus_exceeds(mm, 3);
+}
+
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!static_cpu_has(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ if (meets_global_asid_threshold(mm))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!READ_ONCE(mm->context.asid_transition))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long maxnr = invlpgb_count_max;
+ unsigned long asid = info->mm->context.global_asid;
+ unsigned long addr = info->start;
+ unsigned long nr;
+
+ /* Flushing multiple pages at once is not supported with 1GB pages. */
+ if (info->stride_shift > PMD_SHIFT)
+ maxnr = 1;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ /*
+ * Calculate how many pages can be flushed at once; if the
+ * remainder of the range is less than one page, flush one.
+ */
+ nr = min(maxnr, (info->end - addr) >> info->stride_shift);
+ nr = max(nr, 1);
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ tlbsync();
+}
+#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */
+
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
* If this races with another thread that enables lam, 'new_lam'
@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
+ * Check if the current mm is transitioning to a new ASID.
+ */
+ if (needs_global_asid_reload(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this PCID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
+ /*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that we're changing CR3.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -926,7 +1243,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
@@ -1030,8 +1347,11 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
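
The global ASID allocator above is easiest to follow in isolation. A simplified
user-space model of the used/freed bitmaps and the wrap-around reset (locking,
the invlpgb_flush_all_nonglobals() call and the available counter are omitted;
a small 64-entry ASID space is assumed):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define TLB_NR_DYN_ASIDS	6	/* per-CPU dynamic ASIDs, as on x86 */
#define MAX_ASID_AVAILABLE	64	/* small example space */

static bool asid_used[MAX_ASID_AVAILABLE];
static bool asid_freed[MAX_ASID_AVAILABLE];
static unsigned int last_global_asid = MAX_ASID_AVAILABLE;

static void reset_global_asid_space(void)
{
	/* The kernel does a global flush (invlpgb_flush_all_nonglobals) here. */
	for (int i = 0; i < MAX_ASID_AVAILABLE; i++)
		if (asid_freed[i])
			asid_used[i] = false;
	memset(asid_freed, 0, sizeof(asid_freed));
	last_global_asid = TLB_NR_DYN_ASIDS;
}

static unsigned int get_global_asid(void)
{
	if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
		reset_global_asid_space();

	for (unsigned int asid = last_global_asid; asid < MAX_ASID_AVAILABLE; asid++) {
		if (!asid_used[asid]) {
			asid_used[asid] = true;
			last_global_asid = asid;
			return asid;
		}
	}
	return 0;	/* out of ASIDs until the next wrap-around flush */
}

static void free_global_asid(unsigned int asid)
{
	/* Only reusable after the flush at the next wrap-around. */
	asid_freed[asid] = true;
}

int main(void)
{
	unsigned int a = get_global_asid();
	unsigned int b = get_global_asid();

	free_global_asid(a);
	printf("allocated %u and %u; %u becomes reusable after wrap-around\n", a, b, a);
	return 0;
}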


@@ -0,0 +1,251 @@
From 6f601cdcd33be8fc0da98c6bab777575af3260b8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:29 -0500
Subject: x86/mm: do targeted broadcast flushing from tlbbatch code
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/invlpgb.h | 21 +++++----
arch/x86/include/asm/tlbflush.h | 17 ++++---
arch/x86/mm/tlb.c | 80 +++++++++++++++++++++++++++++++--
3 files changed, 95 insertions(+), 23 deletions(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -31,9 +31,8 @@ static inline void __invlpgb(unsigned lo
}
/* Wait for INVLPGB originated by this CPU to complete. */
-static inline void tlbsync(void)
+static inline void __tlbsync(void)
{
- cant_migrate();
/* TLBSYNC: supported in binutils >= 2.36. */
asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
}
@@ -61,19 +60,19 @@ static inline void invlpgb_flush_user(un
unsigned long addr)
{
__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
- tlbsync();
+ __tlbsync();
}
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
- unsigned long addr,
- u16 nr,
- bool pmd_stride)
+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
{
__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
}
/* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
}
@@ -82,11 +81,11 @@ static inline void invlpgb_flush_single_
static inline void invlpgb_flush_all(void)
{
__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
- tlbsync();
+ __tlbsync();
}
/* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
{
__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
}
@@ -95,7 +94,7 @@ static inline void invlpgb_flush_addr_no
static inline void invlpgb_flush_all_nonglobals(void)
{
__invlpgb(0, 0, 0, 0, 0, 0);
- tlbsync();
+ __tlbsync();
}
#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -106,6 +106,7 @@ struct tlb_state {
* need to be invalidated.
*/
bool invalidate_other;
+ bool need_tlbsync;
#ifdef CONFIG_ADDRESS_MASKING
/*
@@ -309,6 +310,10 @@ static inline void broadcast_tlb_flush(s
static inline void consider_global_asid(struct mm_struct *mm)
{
}
+
+static inline void tlbsync(void)
+{
+}
#endif
#ifdef CONFIG_PARAVIRT
@@ -358,21 +363,15 @@ static inline u64 inc_mm_tlb_gen(struct
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -488,6 +488,37 @@ static void finish_asid_transition(struc
WRITE_ONCE(mm->context.asid_transition, false);
}
+static inline void tlbsync(void)
+{
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ return;
+ __tlbsync();
+ this_cpu_write(cpu_tlbstate.need_tlbsync, false);
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool pmd_stride)
+{
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_flush_single_pcid_nosync(pcid);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb_flush_addr_nosync(addr, nr);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
static void broadcast_tlb_flush(struct flush_tlb_info *info)
{
bool pmd = info->stride_shift == PMD_SHIFT;
@@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
+ tlbsync();
+
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
@@ -976,6 +1009,8 @@ reload_tlb:
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
+ tlbsync();
+
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
@@ -1621,9 +1656,7 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1632,12 +1665,53 @@ void arch_tlbbatch_flush(struct arch_tlb
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ tlbsync();
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ u16 asid = mm_global_asid(mm);
+
+ if (asid) {
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (in_asid_transition(mm))
+ asid = 0;
+ }
+
+ if (!asid) {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or
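
The need_tlbsync handling above follows an "issue asynchronously, sync lazily"
pattern; a minimal stand-alone model (user-space C, with the INVLPGB and
TLBSYNC instructions replaced by printf and the per-CPU flag by a global):

#include <stdio.h>
#include <stdbool.h>

static bool need_tlbsync;		/* cpu_tlbstate.need_tlbsync in the kernel */

static void queue_async_flush(unsigned long addr)
{
	printf("INVLPGB %#lx (asynchronous)\n", addr);
	need_tlbsync = true;		/* remember that a sync is owed */
}

static void tlbsync(void)
{
	if (!need_tlbsync)
		return;			/* nothing outstanding: skip it */
	printf("TLBSYNC (wait for completion)\n");
	need_tlbsync = false;
}

int main(void)
{
	/* e.g. arch_tlbbatch_add_pending() queueing a few pages during reclaim */
	queue_async_flush(0x7f0000001000UL);
	queue_async_flush(0x7f0000002000UL);

	/* e.g. arch_tlbbatch_flush() or a context switch syncing once */
	tlbsync();
	tlbsync();			/* second call is a no-op */
	return 0;
}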


@@ -0,0 +1,80 @@
From 101ba03a6474bbc52971505abf1e3ee9613f255b Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:30 -0500
Subject: x86/mm: enable AMD translation cache extensions
With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.
This can help reduce the TLB miss rate, by keeping more intermediate
mappings in the cache.
From the AMD manual:
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
TLB entries. When this bit is 0, these instructions remove the target PTE
from the TLB as well as all upper-level table entries that are cached
in the TLB, whether or not they are associated with the target PTE.
When this bit is set, these instructions will remove the target PTE and
only those upper-level entries that lead to the target PTE in
the page table hierarchy, leaving unrelated upper-level entries intact.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/msr-index.h | 2 ++
arch/x86/kernel/cpu/amd.c | 4 ++++
tools/arch/x86/include/asm/msr-index.h | 2 ++
3 files changed, 8 insertions(+)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_feature_enabled(X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*


@@ -0,0 +1,80 @@
From 7b8ef03b059bca98d2af696c3ec2adcaa673f7e4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:31 -0500
Subject: x86/mm: only invalidate final translations with INVLPGB
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.
On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/invlpgb.h | 10 ++++++++--
arch/x86/mm/tlb.c | 13 +++++++------
2 files changed, 15 insertions(+), 8 deletions(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -66,9 +66,15 @@ static inline void invlpgb_flush_user(un
static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr,
- bool pmd_stride)
+ bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+ u8 flags = INVLPGB_PCID | INVLPGB_VA;
+
+ if (!freed_tables)
+ flags |= INVLPGB_FINAL_ONLY;
+
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags);
}
/* Flush all mappings for a given PCID, not including globals. */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -498,9 +498,10 @@ static inline void tlbsync(void)
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
- u16 nr, bool pmd_stride)
+ u16 nr, bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride, freed_tables);
if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
this_cpu_write(cpu_tlbstate.need_tlbsync, true);
}
@@ -549,10 +550,10 @@ static void broadcast_tlb_flush(struct f
nr = min(maxnr, (info->end - addr) >> info->stride_shift);
nr = max(nr, 1);
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables);
addr += nr << info->stride_shift;
} while (addr < info->end);
@@ -1686,10 +1687,10 @@ void arch_tlbbatch_add_pending(struct ar
u16 asid = mm_global_asid(mm);
if (asid) {
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false);
/*
* Some CPUs might still be using a local ASID for this
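
The flag selection added to __invlpgb_flush_user_nr_nosync() reduces to a small
piece of bit arithmetic; a stand-alone sketch (user-space C) of which flags end
up being used with and without freed page tables:

#include <stdio.h>
#include <stdbool.h>

#define INVLPGB_VA		(1u << 0)
#define INVLPGB_PCID		(1u << 1)
#define INVLPGB_FINAL_ONLY	(1u << 4)

static unsigned int invlpgb_flags(bool freed_tables)
{
	unsigned int flags = INVLPGB_PCID | INVLPGB_VA;

	/* Keep upper-level translations cached unless page tables were freed. */
	if (!freed_tables)
		flags |= INVLPGB_FINAL_ONLY;
	return flags;
}

int main(void)
{
	printf("plain unmap: %#x, after freeing page tables: %#x\n",
	       invlpgb_flags(false), invlpgb_flags(true));
	return 0;
}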


@@ -0,0 +1,94 @@
From 7b0836fcad644d24d6318bf63013ec1b35d6a27b Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 19 Dec 2024 15:32:53 -0500
Subject: mm: remove unnecessary calls to lru_add_drain
There seem to be several categories of calls to lru_add_drain
and lru_add_drain_all.
The first category is code paths that recently allocated, swapped in,
or otherwise processed a batch of pages, and want them all on
the LRU. These drain pages that were recently allocated,
probably on the local CPU.
A second category is code paths that are actively trying to
reclaim, migrate, or offline memory. These often use lru_add_drain_all
to drain the caches on all CPUs.
However, there also seem to be some other callers where we
aren't really doing either. They are calling lru_add_drain(),
despite operating on pages that may have been allocated
long ago, and quite possibly on different CPUs.
Those calls are not likely to be effective at anything but
creating lock contention on the LRU locks.
Remove the lru_add_drain calls in the latter category.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
---
mm/memory.c | 1 -
mm/mmap.c | 2 --
mm/swap_state.c | 1 -
mm/vma.c | 2 --
4 files changed, 6 deletions(-)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
vma, new_start, length, false, true))
return -ENOMEM;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,


@@ -0,0 +1,429 @@
From 7ecab5a83d3155baa009cd6bc6e18959fee8be62 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Mon, 14 Oct 2024 16:13:39 +0100
Subject: vdso: Introduce vdso/page.h
The VDSO implementation includes headers from outside of the
vdso/ namespace.
Introduce vdso/page.h to make sure that the generic library
uses only the allowed namespace.
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com
---
arch/alpha/include/asm/page.h | 6 +-----
arch/arc/include/uapi/asm/page.h | 7 +++----
arch/arm/include/asm/page.h | 5 +----
arch/arm64/include/asm/page-def.h | 5 +----
arch/csky/include/asm/page.h | 8 ++------
arch/hexagon/include/asm/page.h | 4 +---
arch/loongarch/include/asm/page.h | 7 +------
arch/m68k/include/asm/page.h | 6 ++----
arch/microblaze/include/asm/page.h | 5 +----
arch/mips/include/asm/page.h | 7 +------
arch/nios2/include/asm/page.h | 7 +------
arch/openrisc/include/asm/page.h | 11 +----------
arch/parisc/include/asm/page.h | 4 +---
arch/powerpc/include/asm/page.h | 10 +---------
arch/riscv/include/asm/page.h | 4 +---
arch/s390/include/asm/page.h | 13 +++++--------
arch/sh/include/asm/page.h | 6 ++----
arch/sparc/include/asm/page_32.h | 4 +---
arch/sparc/include/asm/page_64.h | 4 +---
arch/um/include/asm/page.h | 5 +----
arch/x86/include/asm/page_types.h | 5 +----
arch/xtensa/include/asm/page.h | 8 +-------
include/vdso/page.h | 30 ++++++++++++++++++++++++++++++
23 files changed, 61 insertions(+), 110 deletions(-)
create mode 100644 include/vdso/page.h
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -4,11 +4,7 @@
#include <linux/const.h>
#include <asm/pal.h>
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arc/include/uapi/asm/page.h
+++ b/arch/arc/include/uapi/asm/page.h
@@ -14,7 +14,7 @@
/* PAGE_SHIFT determines the page size */
#ifdef __KERNEL__
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+#include <vdso/page.h>
#else
/*
* Default 8k
@@ -24,11 +24,10 @@
* not available
*/
#define PAGE_SHIFT 13
+#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
+#define PAGE_MASK (~(PAGE_SIZE-1))
#endif
-#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
#define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwrds */
-#define PAGE_MASK (~(PAGE_SIZE-1))
-
#endif /* _UAPI__ASM_ARC_PAGE_H */
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -7,10 +7,7 @@
#ifndef _ASMARM_PAGE_H
#define _ASMARM_PAGE_H
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arm64/include/asm/page-def.h
+++ b/arch/arm64/include/asm/page-def.h
@@ -10,9 +10,6 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#endif /* __ASM_PAGE_DEF_H */
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -7,12 +7,8 @@
#include <asm/cache.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size: 4KB
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
+
#define THREAD_SIZE (PAGE_SIZE * 2)
#define THREAD_MASK (~(THREAD_SIZE - 1))
#define THREAD_SHIFT (PAGE_SHIFT + 1)
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -45,9 +45,7 @@
#define HVM_HUGEPAGE_SIZE 0x5
#endif
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -8,12 +8,7 @@
#include <linux/const.h>
#include <asm/addrspace.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3)
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -6,10 +6,8 @@
#include <asm/setup.h>
#include <asm/page_offset.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PAGE_OFFSET (PAGE_OFFSET_RAW)
#ifndef __ASSEMBLY__
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -19,10 +19,7 @@
#ifdef __KERNEL__
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR))
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -14,12 +14,7 @@
#include <linux/kernel.h>
#include <asm/mipsregs.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
/*
* This is used for calculating the real page sizes
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -18,12 +18,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
/*
* PAGE_OFFSET -- the first address of the first page of memory.
--- a/arch/openrisc/include/asm/page.h
+++ b/arch/openrisc/include/asm/page.h
@@ -15,16 +15,7 @@
#ifndef __ASM_OPENRISC_PAGE_H
#define __ASM_OPENRISC_PAGE_H
-
-/* PAGE_SHIFT determines the page size */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define PAGE_OFFSET 0xc0000000
#define KERNELBASE PAGE_OFFSET
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -21,8 +21,7 @@
* page size. When using 64K pages however, whether we are really supporting
* 64K pages in HW or not is irrelevant to those definitions.
*/
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
#ifndef CONFIG_HUGETLB_PAGE
@@ -42,13 +41,6 @@ extern unsigned int hpage_shift;
#endif
/*
- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we
- * assign PAGE_MASK to a larger type it gets extended the way we want
- * (i.e. with 1s in the high bits)
- */
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
-
-/*
* KERNELBASE is the virtual address of the start of the kernel, it's often
* the same as PAGE_OFFSET, but _might not be_.
*
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -12,9 +12,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -11,14 +11,11 @@
#include <linux/const.h>
#include <asm/types.h>
-#define _PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT)
-#define _PAGE_MASK (~(_PAGE_SIZE - 1))
+#include <vdso/page.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT _PAGE_SHIFT
-#define PAGE_SIZE _PAGE_SIZE
-#define PAGE_MASK _PAGE_MASK
+#define _PAGE_SHIFT PAGE_SHIFT
+#define _PAGE_SIZE PAGE_SIZE
+#define _PAGE_MASK PAGE_MASK
#define PAGE_DEFAULT_ACC _AC(0, UL)
/* storage-protection override */
#define PAGE_SPO_ACC 9
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -8,10 +8,8 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PTE_MASK PAGE_MASK
#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -11,9 +11,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
/* Flushing for D-cache alias handling is only needed if
* the page size is smaller than 16K.
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -9,10 +9,7 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -6,10 +6,7 @@
#include <linux/types.h>
#include <linux/mem_encrypt.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -18,13 +18,7 @@
#include <asm/cache.h>
#include <asm/kmem_layout.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifdef CONFIG_MMU
#define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR
--- /dev/null
+++ b/include/vdso/page.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_PAGE_H
+#define __VDSO_PAGE_H
+
+#include <uapi/linux/const.h>
+
+/*
+ * PAGE_SHIFT determines the page size.
+ *
+ * Note: This definition is required because PAGE_SHIFT is used
+ * in several places throughout the codebase.
+ */
+#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+
+#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
+
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+/*
+ * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ *
+ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
+ * So if we assign PAGE_MASK to a larger type it gets extended the
+ * way we want (i.e. with 1s in the high bits)
+ */
+#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
+#else
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#endif /* __VDSO_PAGE_H */
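
The "Subtle" comment above, and the follow-up change to PAGE_MASK's signedness in the next patch, can be reproduced outside the kernel. Below is a minimal userspace C sketch (an editorial illustration, not part of the patch): it assumes a 4 KiB page (shift of 12) and a physical address above 4 GiB, and shows that an unsigned 32-bit mask zero-extends when combined with a 64-bit value and drops the upper bits, while the plain-int form used by PAGE_MASK sign-extends and keeps them. The file name and the DEMO_PAGE_SHIFT macro are hypothetical.

/* pagemask_demo.c - editorial sketch, not kernel code */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12      /* assumed 4 KiB pages */

int main(void)
{
        /* A physical address above 4 GiB, as seen with PAE/LPAE on 32-bit. */
        uint64_t phys = 0x123456789abcULL;

        /* Unsigned 32-bit mask: converted to 64 bits by zero extension. */
        uint32_t umask = ~(((uint32_t)1 << DEMO_PAGE_SHIFT) - 1);

        /* Plain int mask, like the final PAGE_MASK: sign-extends to 64 bits. */
        int32_t smask = ~((1 << DEMO_PAGE_SHIFT) - 1);

        printf("unsigned mask: 0x%016" PRIx64 "\n", phys & umask);
        printf("signed mask:   0x%016" PRIx64 "\n", phys & (uint64_t)smask);
        return 0;
}

With these assumptions the first line prints 0x0000000056789000 (the bits above 4 GiB are lost), while the second prints 0x0000123456789000, which is the behavior the signed definition preserves for 64-bit phys_addr_t users.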

View File

@@ -0,0 +1,68 @@
From d1bcf51400e790e65945a29078bd816bd61aa148 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 24 Oct 2024 13:34:26 +0000
Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures
With the introduction of an architecture-independent definition of
PAGE_MASK, we had to make a choice between defining it as 'unsigned long'
as on 64-bit architectures, or as signed 'long' as required for
architectures with a 64-bit phys_addr_t.
To reduce the risk of regressions and minimize the changes in behavior,
the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT
is set, but that ended up causing a regression after all in the
early_init_dt_add_memory_arch() function that uses 64-bit integers for
address calculation.
Presumably the same regression also affects mips32 and powerpc32 when
dealing with large amounts of memory on DT platforms: like arm32, they were
using the signed version unconditionally.
The two most sensible options for addressing the regression are either to
go back to an architecture specific definition, using a signed constant on
arm/powerpc/mips and unsigned on the others, or to use the same definition
everywhere.
Use the simpler of those two and change them all to the signed version, in
the hope that this does not cause a different type of bug. Most of the
other 32-bit architectures have no large physical address support and are
rarely used, so it seems more likely that using the same definition helps
than hurts here.
In particular, x86-32 does have physical addressing extensions, so it
already changed to the signed version after the previous patch, so it makes
sense to use the same version on non-PAE as well.
Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org
Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/
---
include/vdso/page.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/include/vdso/page.h
+++ b/include/vdso/page.h
@@ -14,13 +14,14 @@
#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+#if !defined(CONFIG_64BIT)
/*
- * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ * Applies only to 32-bit architectures.
*
* Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
* So if we assign PAGE_MASK to a larger type it gets extended the
- * way we want (i.e. with 1s in the high bits)
+ * way we want (i.e. with 1s in the high bits) while masking a
+ * 64-bit value such as phys_addr_t.
*/
#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
#else