From 3fb1083e44f2dc696e388764349d0d92b4b70fd9 Mon Sep 17 00:00:00 2001 From: Konstantin Demin Date: Thu, 2 Jan 2025 21:18:06 +0300 Subject: [PATCH] release 6.12.8 --- debian/bin/genpatch-pfkernel | 2 +- debian/changelog | 7 + debian/config/amd64/config.cloud | 7 - debian/config/amd64/config.mobile | 4 - debian/config/amd64/config.vm | 7 - debian/config/config | 11 +- ...egacy-client-tracking-initialization.patch | 33 ++ ...nprivileged-CLONE_NEWUSER-by-default.patch | 4 +- ..._GATHER_RCU_TABLE_FREE-unconditional.patch | 60 +++ ...ove-pv_ops.mmu.tlb_remove_table-call.patch | 137 +++++ ...m-add-X86_FEATURE_INVLPGB-definition.patch | 23 + ...-mm-get-INVLPGB-count-max-from-CPUID.patch | 57 ++ ...0005-x86-mm-add-INVLPGB-support-code.patch | 121 +++++ ...m-use-INVLPGB-for-kernel-TLB-flushes.patch | 61 +++ ...x86-tlb-use-INVLPGB-in-flush_tlb_all.patch | 28 + ...cast-TLB-flushing-for-page-reclaim-T.patch | 36 ++ ...oadcast-TLB-invalidation-for-multi-t.patch | 508 ++++++++++++++++++ ...ted-broadcast-flushing-from-tlbbatch.patch | 126 +++++ ...ble-AMD-translation-cache-extensions.patch | 82 +++ ...lidate-final-translations-with-INVLP.patch | 28 + ...e-unnecessary-calls-to-lru_add_drain.patch | 92 ++++ ...d-missing-statement-in-resume_phase3.patch | 27 + debian/patches/series | 16 + 23 files changed, 1455 insertions(+), 22 deletions(-) create mode 100644 debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch create mode 
100644 debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch create mode 100644 debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch create mode 100644 debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch diff --git a/debian/bin/genpatch-pfkernel b/debian/bin/genpatch-pfkernel index ee762b8..2cb330e 100755 --- a/debian/bin/genpatch-pfkernel +++ b/debian/bin/genpatch-pfkernel @@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w" dst='debian/patches/pf-tmp' src='../linux-extras' -branches='amd-pstate amd-rapl cpuidle crypto fixes kbuild pksm xfs zstd' +branches='amd-pstate amd-rapl cpuidle crypto fixes invlpgb kbuild pksm xfs zstd' if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi mkdir -p "${dst}" diff --git a/debian/changelog b/debian/changelog index d5bc7ca..935142d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +linux (6.12.8-1) sid; urgency=medium + + * New upstream stable update: + https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.8 + + -- Konstantin Demin Thu, 02 Jan 2025 19:34:34 +0300 + linux (6.12.7-1) sid; urgency=medium * New upstream stable update: diff --git a/debian/config/amd64/config.cloud 
b/debian/config/amd64/config.cloud index fb8e5ae..78b6db3 100644 --- a/debian/config/amd64/config.cloud +++ b/debian/config/amd64/config.cloud @@ -1384,9 +1384,6 @@ CONFIG_BLK_DEV_PMEM=m # CONFIG_NVME_HWMON is not set # CONFIG_NVME_RDMA is not set CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -# CONFIG_NVME_TCP_TLS is not set -# CONFIG_NVME_HOST_AUTH is not set ## ## file: drivers/nvme/target/Kconfig @@ -1394,8 +1391,6 @@ CONFIG_NVME_TCP=m CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m # CONFIG_NVME_TARGET_FCLOOP is not set -CONFIG_NVME_TARGET_TCP=m -# CONFIG_NVME_TARGET_TCP_TLS is not set ## ## file: drivers/of/Kconfig @@ -2495,8 +2490,6 @@ CONFIG_KEXEC_CORE=y CONFIG_LZ4HC_COMPRESS=m CONFIG_LZ4_COMPRESS=m CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m CONFIG_NETFS_SUPPORT=m diff --git a/debian/config/amd64/config.mobile b/debian/config/amd64/config.mobile index 21cb54a..edfbf5d 100644 --- a/debian/config/amd64/config.mobile +++ b/debian/config/amd64/config.mobile @@ -4892,14 +4892,11 @@ CONFIG_OF_PMEM=y ## CONFIG_NVME_HWMON=y # CONFIG_NVME_FC is not set -# CONFIG_NVME_TCP is not set -CONFIG_NVME_HOST_AUTH=y ## ## file: drivers/nvme/target/Kconfig ## # CONFIG_NVME_TARGET_FC is not set -# CONFIG_NVME_TARGET_TCP is not set ## ## file: drivers/nvmem/Kconfig @@ -8756,7 +8753,6 @@ CONFIG_NFC_ST_NCI=m CONFIG_NF_NAT_AMANDA=m CONFIG_NLS_UCS2_UTILS=m CONFIG_NVMEM_LAYOUTS=y -CONFIG_NVME_AUTH=m CONFIG_OF_ADDRESS=y CONFIG_OF_EARLY_FLATTREE=y CONFIG_OF_FLATTREE=y diff --git a/debian/config/amd64/config.vm b/debian/config/amd64/config.vm index 6274efa..26e2257 100644 --- a/debian/config/amd64/config.vm +++ b/debian/config/amd64/config.vm @@ -2355,9 +2355,6 @@ CONFIG_BLK_DEV_PMEM=m CONFIG_NVME_HWMON=y CONFIG_NVME_RDMA=m CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -# CONFIG_NVME_TCP_TLS is not set -# CONFIG_NVME_HOST_AUTH is not set ## ## file: drivers/nvme/target/Kconfig @@ -2365,8 +2362,6 @@ CONFIG_NVME_TCP=m 
CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m # CONFIG_NVME_TARGET_FCLOOP is not set -CONFIG_NVME_TARGET_TCP=m -# CONFIG_NVME_TARGET_TCP_TLS is not set ## ## file: drivers/of/Kconfig @@ -4068,8 +4063,6 @@ CONFIG_LZ4_COMPRESS=m CONFIG_MAPPING_DIRTY_HELPERS=y CONFIG_MCTP_FLOWS=y CONFIG_MFD_CORE=m -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MOUSE_PS2_SMBUS=y CONFIG_ND_BTT=m CONFIG_ND_PFN=m diff --git a/debian/config/config b/debian/config/config index 5a27038..57062d2 100644 --- a/debian/config/config +++ b/debian/config/config @@ -1098,6 +1098,9 @@ CONFIG_NVDIMM_DAX=y CONFIG_BLK_DEV_NVME=m CONFIG_NVME_MULTIPATH=y CONFIG_NVME_VERBOSE_ERRORS=y +CONFIG_NVME_TCP=m +CONFIG_NVME_TCP_TLS=y +CONFIG_NVME_HOST_AUTH=y ## ## file: drivers/nvme/target/Kconfig @@ -1106,7 +1109,9 @@ CONFIG_NVME_TARGET=m # CONFIG_NVME_TARGET_DEBUGFS is not set CONFIG_NVME_TARGET_PASSTHRU=y CONFIG_NVME_TARGET_LOOP=m -# CONFIG_NVME_TARGET_AUTH is not set +CONFIG_NVME_TARGET_TCP=m +CONFIG_NVME_TARGET_TCP_TLS=y +CONFIG_NVME_TARGET_AUTH=y ## ## file: drivers/nvmem/Kconfig @@ -3941,6 +3946,8 @@ CONFIG_MLX4_CORE=m CONFIG_MMCONF_FAM10H=y CONFIG_MMU=y CONFIG_MMU_GATHER_MERGE_VMAS=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_MMU_GATHER_TABLE_FREE=y CONFIG_MMU_LAZY_TLB_REFCOUNT=y CONFIG_MMU_NOTIFIER=y CONFIG_MODULES_TREE_LOOKUP=y @@ -4015,8 +4022,10 @@ CONFIG_NR_CPUS_RANGE_END=512 CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_NUMA_MEMBLKS=y CONFIG_NVDIMM_KEYS=y +CONFIG_NVME_AUTH=m CONFIG_NVME_CORE=m CONFIG_NVME_FABRICS=m +CONFIG_NVME_KEYRING=m CONFIG_OBJTOOL=y CONFIG_OID_REGISTRY=y CONFIG_OLD_SIGSUSPEND3=y diff --git a/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch b/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch new file mode 100644 index 0000000..0d890b5 --- /dev/null +++ b/debian/patches/bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch @@ -0,0 +1,33 @@ +From: Scott Mayhew +Date: Tue, 10 Dec 2024 07:25:54 
-0500 +Subject: nfsd: fix legacy client tracking initialization +Origin: https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/commit/?h=nfsd-next&id=45cd8c0c13fe5c9f1b926bd307df431f8f1b8a16 +Bug: https://bugzilla.kernel.org/show_bug.cgi?id=219580 +Bug-Debian: https://bugs.debian.org/1087900 + +Get rid of the nfsd4_legacy_tracking_ops->init() call in +check_for_legacy_methods(). That will be handled in the caller +(nfsd4_client_tracking_init()). Otherwise, we'll wind up calling +nfsd4_legacy_tracking_ops->init() twice, and the second time we'll +trigger the BUG_ON() in nfsd4_init_recdir(). + +Fixes: 74fd48739d04 ("nfsd: new Kconfig option for legacy client tracking") +Reported-by: Jur van der Burg +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219580 +Signed-off-by: Scott Mayhew +Reviewed-by: Jeff Layton +Signed-off-by: Chuck Lever +--- + fs/nfsd/nfs4recover.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/nfsd/nfs4recover.c ++++ b/fs/nfsd/nfs4recover.c +@@ -2052,7 +2052,6 @@ static inline int check_for_legacy_metho + path_put(&path); + if (status) + return -ENOTDIR; +- status = nn->client_tracking_ops->init(net); + } + return status; + } diff --git a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch index 3b75384..4ba7b9f 100644 --- a/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch +++ b/debian/patches/debian/add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by-default.patch @@ -34,7 +34,7 @@ Signed-off-by: Serge Hallyn /* * Minimum number of threads to boot the kernel */ -@@ -2158,6 +2164,10 @@ __latent_entropy struct task_struct *cop +@@ -2157,6 +2163,10 @@ __latent_entropy struct task_struct *cop if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -45,7 +45,7 @@ Signed-off-by: Serge Hallyn /* * Thread groups must share signals as 
well, and detached threads * can only be started up within the thread group. -@@ -3311,6 +3321,12 @@ int ksys_unshare(unsigned long unshare_f +@@ -3310,6 +3320,12 @@ int ksys_unshare(unsigned long unshare_f if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; diff --git a/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch b/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch new file mode 100644 index 0000000..1fba26d --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch @@ -0,0 +1,60 @@ +From 60fbdd9e9dc7074d4cd30ada3ba9547d5c007702 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:02 -0500 +Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional + +Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using +paravirt, and not when running on bare metal. + +There is no real good reason to do things differently for +each setup. Make them all the same. + +After this change, the synchronization between get_user_pages_fast +and page table freeing is handled by RCU, which prevents page tables +from being reused for other data while get_user_pages_fast is walking +them. + +This allows us to invalidate page tables while other CPUs have +interrupts disabled. 
+ +Signed-off-by: Rik van Riel +Suggested-by: Peter Zijlstra +--- + arch/x86/Kconfig | 2 +- + arch/x86/kernel/paravirt.c | 7 +------ + 2 files changed, 2 insertions(+), 7 deletions(-) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -270,7 +270,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP +- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT ++ select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) + static_branch_enable(&virt_spin_lock_key); + } + +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +- + struct static_key paravirt_steal_enabled; + struct static_key paravirt_steal_rq_enabled; + +@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = native_tlb_remove_table, ++ .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch b/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch new file mode 100644 index 0000000..4178de2 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch @@ -0,0 +1,137 @@ +From 8966aff4928c0bc3aa79b8729d74da5ea782f73a Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:03 -0500 +Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call + +Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. 
+ +Get rid of the indirection by simply calling tlb_remove_table directly, +and not going through the paravirt function pointers. + +Signed-off-by: Rik van Riel +Suggested-by: Qi Zheng +--- + arch/x86/hyperv/mmu.c | 1 - + arch/x86/include/asm/paravirt.h | 5 ----- + arch/x86/include/asm/paravirt_types.h | 2 -- + arch/x86/kernel/kvm.c | 1 - + arch/x86/kernel/paravirt.c | 1 - + arch/x86/mm/pgtable.c | 16 ++++------------ + arch/x86/xen/mmu_pv.c | 1 - + 7 files changed, 4 insertions(+), 23 deletions(-) + +--- a/arch/x86/hyperv/mmu.c ++++ b/arch/x86/hyperv/mmu.c +@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + } +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); + } + +-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +-} +- + static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) + { + PVOP_VCALL1(mmu.exit_mmap, mm); +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -136,8 +136,6 @@ struct pv_mmu_ops { + void (*flush_tlb_multi)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); +- + /* Hook for intercepting the destruction of an mm_struct. 
*/ + void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) + #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops = + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); + #define PGTABLE_HIGHMEM 0 + #endif + +-#ifndef CONFIG_PARAVIRT +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +-#endif +- + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + + pgtable_t pte_alloc_one(struct mm_struct *mm) +@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather * + { + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); +- paravirt_tlb_remove_table(tlb, pte); ++ tlb_remove_table(tlb, pte); + } + + #if CONFIG_PGTABLE_LEVELS > 2 +@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather * + tlb->need_flush_all = 1; + #endif + pagetable_pmd_dtor(ptdesc); +- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); ++ tlb_remove_table(tlb, ptdesc_page(ptdesc)); + } + + #if CONFIG_PGTABLE_LEVELS > 3 +@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather * + + pagetable_pud_dtor(ptdesc); + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); +- 
paravirt_tlb_remove_table(tlb, virt_to_page(pud)); ++ tlb_remove_table(tlb, virt_to_page(pud)); + } + + #if CONFIG_PGTABLE_LEVELS > 4 + void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) + { + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); ++ tlb_remove_table(tlb, virt_to_page(p4d)); + } + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, +- .tlb_remove_table = tlb_remove_table, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, diff --git a/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch b/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch new file mode 100644 index 0000000..cb5131a --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch @@ -0,0 +1,23 @@ +From efde57842082e36ab2e2be5a11c7b06ff9e18b3d Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:04 -0500 +Subject: x86/mm: add X86_FEATURE_INVLPGB definition. + +Add the INVPLGB CPUID definition, allowing the kernel to recognize +whether the CPU supports the INVLPGB instruction. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/cpufeatures.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -335,6 +335,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* "invlpgb" INVLPGB instruction */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ diff --git a/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch b/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch new file mode 100644 index 0000000..fc2995c --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch @@ -0,0 +1,57 @@ +From 98953e10e342ceea1dc877cfb63318fa85879a59 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:05 -0500 +Subject: x86/mm: get INVLPGB count max from CPUID + +The CPU advertises the maximum number of pages that can be shot down +with one INVLPGB instruction in the CPUID data. + +Save that information for later use. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/tlbflush.h | 1 + + arch/x86/kernel/cpu/amd.c | 8 ++++++++ + arch/x86/kernel/setup.c | 4 ++++ + 3 files changed, 13 insertions(+) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -182,6 +182,7 @@ static inline void cr4_init_shadow(void) + + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; ++extern u16 invlpgb_count_max; + + extern void initialize_tlbstate_and_flush(void); + +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1135,6 +1135,14 @@ static void cpu_detect_tlb_amd(struct cp + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ if (c->extended_cpuid_level < 0x80000008) ++ return; ++ ++ cpuid(0x80000008, &eax, &ebx, &ecx, &edx); ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ invlpgb_count_max = (edx & 0xffff) + 1; + } + + static const struct cpu_dev amd_cpu_dev = { +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -138,6 +138,10 @@ __visible unsigned long mmu_cr4_features + __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; + #endif + ++#ifdef CONFIG_CPU_SUP_AMD ++u16 invlpgb_count_max __ro_after_init; ++#endif ++ + #ifdef CONFIG_IMA + static phys_addr_t ima_kexec_buffer_phys; + static size_t ima_kexec_buffer_size; diff --git a/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch b/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch new file mode 100644 index 0000000..f116ad6 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch @@ -0,0 +1,121 @@ +From bc9d1fa1bd32dca78f38bd2a8557e7fc638308bd Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:06 -0500 +Subject: x86/mm: add INVLPGB support code + +Add invlpgb.h with the helper functions and definitions needed to use +broadcast TLB invalidation on AMD EPYC 
3 and newer CPUs. + +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 93 +++++++++++++++++++++++++++++++++ + arch/x86/include/asm/tlbflush.h | 1 + + 2 files changed, 94 insertions(+) + create mode 100644 arch/x86/include/asm/invlpgb.h + +--- /dev/null ++++ b/arch/x86/include/asm/invlpgb.h +@@ -0,0 +1,93 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_INVLPGB ++#define _ASM_X86_INVLPGB ++ ++#include ++ ++/* ++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. ++ * ++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can ++ * be done in a parallel fashion. ++ * ++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from ++ * this CPU have completed. ++ */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, ++ int extra_count, bool pmd_stride, unsigned long flags) ++{ ++ u64 rax = addr | flags; ++ u32 ecx = (pmd_stride << 31) | extra_count; ++ u32 edx = (pcid << 16) | asid; ++ ++ asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx)); ++} ++ ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first can be used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_VA BIT(0) ++#define INVLPGB_PCID BIT(1) ++#define INVLPGB_ASID BIT(2) ++#define INVLPGB_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FINAL_ONLY BIT(4) ++#define INVLPGB_INCLUDE_NESTED BIT(5) ++ ++/* Flush all mappings for a given pcid and addr, not including globals. 
*/ ++static inline void invlpgb_flush_user(unsigned long pcid, ++ unsigned long addr) ++{ ++ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); ++} ++ ++static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr, ++ int nr, bool pmd_stride) ++{ ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++} ++ ++/* Flush all mappings for a given ASID, not including globals. */ ++static inline void invlpgb_flush_single_asid(unsigned long asid) ++{ ++ __invlpgb(asid, 0, 0, 0, 0, INVLPGB_ASID); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid(unsigned long pcid) ++{ ++ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr(unsigned long addr, int nr) ++{ ++ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ __invlpgb(0, 0, 0, 0, 0, 0); ++} ++ ++/* Wait for INVLPGB originated by this CPU to complete. 
*/ ++static inline void tlbsync(void) ++{ ++ asm volatile("tlbsync"); ++} ++ ++#endif /* _ASM_X86_INVLPGB */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include diff --git a/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch b/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch new file mode 100644 index 0000000..56a6df3 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch @@ -0,0 +1,61 @@ +From ffd834c7140dc5fcaf96161c6d8c4601bb700afe Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:07 -0500 +Subject: x86/mm: use INVLPGB for kernel TLB flushes + +Use broadcast TLB invalidation for kernel addresses when available. + +This stops us from having to send IPIs for kernel TLB flushes. + +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 31 +++++++++++++++++++++++++++++++ + 1 file changed, 31 insertions(+) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1048,6 +1048,32 @@ void flush_tlb_all(void) + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + ++static void broadcast_kernel_range_flush(unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ unsigned long maxnr = invlpgb_count_max; ++ unsigned long threshold = tlb_single_page_flush_ceiling * maxnr; ++ ++ /* ++ * TLBSYNC only waits for flushes originating on the same CPU. ++ * Disabling migration allows us to wait on all flushes. 
++ */ ++ guard(preempt)(); ++ ++ if (end == TLB_FLUSH_ALL || ++ (end - start) > threshold << PAGE_SHIFT) { ++ invlpgb_flush_all(); ++ } else { ++ unsigned long nr; ++ for (addr = start; addr < end; addr += nr << PAGE_SHIFT) { ++ nr = min((end - addr) >> PAGE_SHIFT, maxnr); ++ invlpgb_flush_addr(addr, nr); ++ } ++ } ++ ++ tlbsync(); ++} ++ + static void do_kernel_range_flush(void *info) + { + struct flush_tlb_info *f = info; +@@ -1060,6 +1086,11 @@ static void do_kernel_range_flush(void * + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) + { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ broadcast_kernel_range_flush(start, end); ++ return; ++ } ++ + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { diff --git a/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch b/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch new file mode 100644 index 0000000..fe9ed69 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch @@ -0,0 +1,28 @@ +From 13fac8226036456c15c517c1dd77be5109a61da2 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:08 -0500 +Subject: x86/tlb: use INVLPGB in flush_tlb_all + +The flush_tlb_all() function is not used a whole lot, but we might +as well use broadcast TLB flushing there, too. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1045,6 +1045,12 @@ static void do_flush_tlb_all(void *info) + void flush_tlb_all(void) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ guard(preempt)(); ++ invlpgb_flush_all(); ++ tlbsync(); ++ return; ++ } + on_each_cpu(do_flush_tlb_all, NULL, 1); + } + diff --git a/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch b/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch new file mode 100644 index 0000000..4f33505 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch @@ -0,0 +1,36 @@ +From 765d531296765e7fb2888c70cb56c0e25b459231 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:09 -0500 +Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing + +In the page reclaim code, we only track the CPU(s) where the TLB needs +to be flushed, rather than all the individual mappings that may be getting +invalidated. + +Use broadcast TLB flushing when that is available. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/mm/tlb.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1281,8 +1281,16 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all); + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + { + struct flush_tlb_info *info; ++ int cpu; + +- int cpu = get_cpu(); ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ guard(preempt)(); ++ invlpgb_flush_all_nonglobals(); ++ tlbsync(); ++ return; ++ } ++ ++ cpu = get_cpu(); + + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); diff --git a/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch b/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch new file mode 100644 index 0000000..7786212 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch @@ -0,0 +1,508 @@ +From 8b23125a3200a330fb407133f33aeb9ad3232603 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:10 -0500 +Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded + processes + +Use broadcast TLB invalidation, using the INVPLGB instruction, on AMD EPYC 3 +and newer CPUs. + +In order to not exhaust PCID space, and keep TLB flushes local for single +threaded processes, we only hand out broadcast ASIDs to processes active on +3 or more CPUs, and gradually increase the threshold as broadcast ASID space +is depleted. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/mmu.h | 6 + + arch/x86/include/asm/mmu_context.h | 12 ++ + arch/x86/include/asm/tlbflush.h | 17 ++ + arch/x86/mm/tlb.c | 310 ++++++++++++++++++++++++++++- + 4 files changed, 336 insertions(+), 9 deletions(-) + +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -46,6 +46,12 @@ typedef struct { + unsigned long flags; + #endif + ++#ifdef CONFIG_CPU_SUP_AMD ++ struct list_head broadcast_asid_list; ++ u16 broadcast_asid; ++ bool asid_transition; ++#endif ++ + #ifdef CONFIG_ADDRESS_MASKING + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ + unsigned long lam_cr3_mask; +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++extern void destroy_context_free_broadcast_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). 
+@@ -160,6 +162,13 @@ static inline int init_new_context(struc + mm->context.execute_only_pkey = -1; + } + #endif ++ ++#ifdef CONFIG_CPU_SUP_AMD ++ INIT_LIST_HEAD(&mm->context.broadcast_asid_list); ++ mm->context.broadcast_asid = 0; ++ mm->context.asid_transition = false; ++#endif ++ + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -169,6 +178,9 @@ static inline int init_new_context(struc + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++#ifdef CONFIG_CPU_SUP_AMD ++ destroy_context_free_broadcast_asid(mm); ++#endif + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -65,6 +65,23 @@ static inline void cr4_clear_bits(unsign + */ + #define TLB_NR_DYN_ASIDS 6 + ++#ifdef CONFIG_CPU_SUP_AMD ++#define is_dyn_asid(asid) ((asid) < TLB_NR_DYN_ASIDS) ++#define is_broadcast_asid(asid) ((asid) >= TLB_NR_DYN_ASIDS) ++#define in_asid_transition(info) ((info)->mm && (info)->mm->context.asid_transition) ++#define mm_broadcast_asid(mm) ((mm)->context.broadcast_asid) ++#else ++#define is_dyn_asid(asid) true ++#define is_broadcast_asid(asid) false ++#define in_asid_transition(info) false ++#define mm_broadcast_asid(mm) 0 ++ ++static inline bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ return false; ++} ++#endif ++ + struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, 
because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -225,6 +227,18 @@ static void choose_new_asid(struct mm_st + return; + } + ++ /* ++ * TLB consistency for this ASID is maintained with INVLPGB; ++ * TLB flushes happen even while the process isn't running. ++ */ ++#ifdef CONFIG_CPU_SUP_AMD ++ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(next)) { ++ *new_asid = mm_broadcast_asid(next); ++ *need_flush = false; ++ return; ++ } ++#endif ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -251,6 +265,245 @@ static void choose_new_asid(struct mm_st + *need_flush = true; + } + ++#ifdef CONFIG_CPU_SUP_AMD ++/* ++ * Logic for AMD INVLPGB support. ++ */ ++static DEFINE_RAW_SPINLOCK(broadcast_asid_lock); ++static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS; ++static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 }; ++static LIST_HEAD(broadcast_asid_list); ++static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++static void reset_broadcast_asid_space(void) ++{ ++ mm_context_t *context; ++ ++ lockdep_assert_held(&broadcast_asid_lock); ++ ++ /* ++ * Flush once when we wrap around the ASID space, so we won't need ++ * to flush every time we allocate an ASID for broadcast flushing. ++ */ ++ invlpgb_flush_all_nonglobals(); ++ tlbsync(); ++ ++ /* ++ * Leave the currently used broadcast ASIDs set in the bitmap, since ++ * those cannot be reused before the next wraparound and flush. 
++ */ ++ bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE); ++ list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list) ++ __set_bit(context->broadcast_asid, broadcast_asid_used); ++ ++ last_broadcast_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 get_broadcast_asid(void) ++{ ++ lockdep_assert_held(&broadcast_asid_lock); ++ ++ do { ++ u16 start = last_broadcast_asid; ++ u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start); ++ ++ if (asid >= MAX_ASID_AVAILABLE) { ++ reset_broadcast_asid_space(); ++ continue; ++ } ++ ++ /* Try claiming this broadcast ASID. */ ++ if (!test_and_set_bit(asid, broadcast_asid_used)) { ++ last_broadcast_asid = asid; ++ return asid; ++ } ++ } while (1); ++} ++ ++/* ++ * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast ++ * (INVLPGB) ASID, or the other way around. ++ */ ++static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid) ++{ ++ u16 broadcast_asid = mm_broadcast_asid(next); ++ ++ if (broadcast_asid && prev_asid != broadcast_asid) ++ return true; ++ ++ if (!broadcast_asid && is_broadcast_asid(prev_asid)) ++ return true; ++ ++ return false; ++} ++ ++void destroy_context_free_broadcast_asid(struct mm_struct *mm) ++{ ++ if (!mm->context.broadcast_asid) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&broadcast_asid_lock); ++ mm->context.broadcast_asid = 0; ++ list_del(&mm->context.broadcast_asid_list); ++ broadcast_asid_available++; ++} ++ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. 
*/ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a broadcast ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_broadcast_asid(struct mm_struct *mm) ++{ ++ guard(raw_spinlock_irqsave)(&broadcast_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm->context.broadcast_asid) ++ return; ++ ++ mm->context.broadcast_asid = get_broadcast_asid(); ++ mm->context.asid_transition = true; ++ list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list); ++ broadcast_asid_available--; ++} ++ ++/* ++ * Figure out whether to assign a broadcast (global) ASID to a process. ++ * We vary the threshold by how empty or full broadcast ASID space is. ++ * 1/4 full: >= 4 active threads ++ * 1/2 full: >= 8 active threads ++ * 3/4 full: >= 16 active threads ++ * 7/8 full: >= 32 active threads ++ * etc ++ * ++ * This way we should never exhaust the broadcast ASID space, even on very ++ * large systems, and the processes with the largest number of active ++ * threads should be able to use broadcast TLB invalidation. 
++ */ ++#define HALFFULL_THRESHOLD 8 ++static bool meets_broadcast_asid_threshold(struct mm_struct *mm) ++{ ++ int avail = broadcast_asid_available; ++ int threshold = HALFFULL_THRESHOLD; ++ ++ if (!avail) ++ return false; ++ ++ if (avail > MAX_ASID_AVAILABLE * 3 / 4) { ++ threshold = HALFFULL_THRESHOLD / 4; ++ } else if (avail > MAX_ASID_AVAILABLE / 2) { ++ threshold = HALFFULL_THRESHOLD / 2; ++ } else if (avail < MAX_ASID_AVAILABLE / 3) { ++ do { ++ avail *= 2; ++ threshold *= 2; ++ } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2); ++ } ++ ++ return mm_active_cpus_exceeds(mm, threshold); ++} ++ ++static void count_tlb_flush(struct mm_struct *mm) ++{ ++ if (!static_cpu_has(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ if (meets_broadcast_asid_threshold(mm)) ++ use_broadcast_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_broadcast_asid(mm); ++ int cpu; ++ ++ if (!mm->context.asid_transition) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the broadcast ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ */ ++ if (per_cpu(cpu_tlbstate.loaded_mm_asid, cpu) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the broadcast ASID. */ ++ mm->context.asid_transition = 0; ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long maxnr = invlpgb_count_max; ++ unsigned long asid = info->mm->context.broadcast_asid; ++ unsigned long addr = info->start; ++ unsigned long nr; ++ ++ /* Flushing multiple pages at once is not supported with 1GB pages. 
*/ ++ if (info->stride_shift > PMD_SHIFT) ++ maxnr = 1; ++ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid(user_pcid(asid)); ++ } else do { ++ /* ++ * Calculate how many pages can be flushed at once; if the ++ * remainder of the range is less than one page, flush one. ++ */ ++ nr = min(maxnr, (info->end - addr) >> info->stride_shift); ++ nr = max(nr, 1); ++ ++ invlpgb_flush_user_nr(kern_pcid(asid), addr, nr, pmd); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr(user_pcid(asid), addr, nr, pmd); ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ tlbsync(); ++} ++#endif /* CONFIG_CPU_SUP_AMD */ ++ + /* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. +@@ -556,8 +809,9 @@ void switch_mm_irqs_off(struct mm_struct + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != +- next->context.ctx_id); ++ if (is_dyn_asid(prev_asid)) ++ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ next->context.ctx_id); + + /* + * If this races with another thread that enables lam, 'new_lam' +@@ -574,6 +828,23 @@ void switch_mm_irqs_off(struct mm_struct + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* ++ * Check if the current mm is transitioning to a new ASID. ++ */ ++ if (needs_broadcast_asid_reload(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this PCID up to date ++ * all the time. 
++ */ ++ if (is_broadcast_asid(prev_asid)) ++ return; ++ ++ /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. +@@ -629,8 +900,10 @@ void switch_mm_irqs_off(struct mm_struct + barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_BUG_ON(is_broadcast_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -749,7 +1022,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -769,6 +1042,16 @@ static void flush_tlb_func(void *info) + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a broadcast ASID */ ++ if (needs_broadcast_asid_reload(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. 
*/ ++ if (is_broadcast_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -786,6 +1069,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -926,7 +1211,7 @@ STATIC_NOPV void native_flush_tlb_multi( + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || in_asid_transition(info)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, +@@ -998,14 +1283,18 @@ void flush_tlb_mm_range(struct mm_struct + bool freed_tables) + { + struct flush_tlb_info *info; ++ unsigned long threshold = tlb_single_page_flush_ceiling; + u64 new_tlb_gen; + int cpu; + ++ if (static_cpu_has(X86_FEATURE_INVLPGB)) ++ threshold *= invlpgb_count_max; ++ + cpu = get_cpu(); + + /* Should we flush just the requested range? */ + if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { ++ ((end - start) >> stride_shift) > threshold) { + start = 0; + end = TLB_FLUSH_ALL; + } +@@ -1021,8 +1310,11 @@ void flush_tlb_mm_range(struct mm_struct + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (IS_ENABLED(CONFIG_CPU_SUP_AMD) && mm_broadcast_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + flush_tlb_multi(mm_cpumask(mm), info); ++ count_tlb_flush(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); diff --git a/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch b/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch new file mode 100644 index 0000000..42e7f79 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch @@ -0,0 +1,126 @@ +From 1767a2786ebbe3451f973df44485309c2a8fd8a5 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:11 -0500 +Subject: x86,tlb: do targeted broadcast flushing from tlbbatch code + +Instead of doing a system-wide TLB flush from arch_tlbbatch_flush, +queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending. + +This also allows us to avoid adding the CPUs of processes using broadcast +flushing to the batch->cpumask, and will hopefully further reduce TLB +flushing from the reclaim and compaction paths. + +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/tlbbatch.h | 1 + + arch/x86/include/asm/tlbflush.h | 12 +++------ + arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++------- + 3 files changed, 42 insertions(+), 19 deletions(-) + +--- a/arch/x86/include/asm/tlbbatch.h ++++ b/arch/x86/include/asm/tlbbatch.h +@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch { + * the PFNs being flushed.. 
+ */ + struct cpumask cpumask; ++ bool used_invlpgb; + }; + + #endif /* _ARCH_X86_TLBBATCH_H */ +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -296,21 +296,15 @@ static inline u64 inc_mm_tlb_gen(struct + return atomic64_inc_return(&mm->context.tlb_gen); + } + +-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, +- struct mm_struct *mm, +- unsigned long uaddr) +-{ +- inc_mm_tlb_gen(mm); +- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); +- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +-} +- + static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) + { + flush_tlb_mm(mm); + } + + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); ++extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr); + + static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -1573,16 +1573,7 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all); + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + { + struct flush_tlb_info *info; +- int cpu; +- +- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { +- guard(preempt)(); +- invlpgb_flush_all_nonglobals(); +- tlbsync(); +- return; +- } +- +- cpu = get_cpu(); ++ int cpu = get_cpu(); + + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); +@@ -1600,12 +1591,49 @@ void arch_tlbbatch_flush(struct arch_tlb + local_irq_enable(); + } + ++ /* ++ * If we issued (asynchronous) INVLPGB flushes, wait for them here. ++ * The cpumask above contains only CPUs that were running tasks ++ * not using broadcast TLB flushing. 
++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) { ++ tlbsync(); ++ migrate_enable(); ++ batch->used_invlpgb = false; ++ } ++ + cpumask_clear(&batch->cpumask); + + put_flush_tlb_info(); + put_cpu(); + } + ++void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, ++ struct mm_struct *mm, ++ unsigned long uaddr) ++{ ++ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(mm)) { ++ u16 asid = mm_broadcast_asid(mm); ++ /* ++ * Queue up an asynchronous invalidation. The corresponding ++ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done ++ * on the same CPU. ++ */ ++ if (!batch->used_invlpgb) { ++ batch->used_invlpgb = true; ++ migrate_disable(); ++ } ++ invlpgb_flush_user_nr(kern_pcid(asid), uaddr, 1, 0); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (static_cpu_has(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr(user_pcid(asid), uaddr, 1, 0); ++ } else { ++ inc_mm_tlb_gen(mm); ++ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); ++ } ++ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); ++} ++ + /* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or diff --git a/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch b/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch new file mode 100644 index 0000000..b72f506 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch @@ -0,0 +1,82 @@ +From 13faf551d1a146ed18c448babe1953def4ed3d56 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:12 -0500 +Subject: x86/mm: enable AMD translation cache extensions + +With AMD TCE (translation cache extensions) only the intermediate mappings +that cover the address range zapped by INVLPG / INVLPGB get invalidated, +rather than all intermediate mappings getting 
zapped at every TLB invalidation. + +This can help reduce the TLB miss rate, by keeping more intermediate +mappings in the cache. + +>From the AMD manual: + +Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit +to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on +TLB entries. When this bit is 0, these instructions remove the target PTE +from the TLB as well as all upper-level table entries that are cached +in the TLB, whether or not they are associated with the target PTE. +When this bit is set, these instructions will remove the target PTE and +only those upper-level entries that lead to the target PTE in +the page table hierarchy, leaving unrelated upper-level entries intact. + +Signed-off-by: Rik van Riel +--- + arch/x86/kernel/cpu/amd.c | 8 ++++++++ + arch/x86/mm/tlb.c | 10 +++++++--- + 2 files changed, 15 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -1143,6 +1143,14 @@ static void cpu_detect_tlb_amd(struct cp + + /* Max number of pages INVLPGB can invalidate in one shot */ + invlpgb_count_max = (edx & 0xffff) + 1; ++ ++ /* If supported, enable translation cache extensions (TCE) */ ++ cpuid(0x80000001, &eax, &ebx, &ecx, &edx); ++ if (ecx & BIT(17)) { ++ u64 msr = native_read_msr(MSR_EFER); ++ msr |= BIT(15); ++ wrmsrl(MSR_EFER, msr); ++ } + } + + static const struct cpu_dev amd_cpu_dev = { +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -477,7 +477,7 @@ static void broadcast_tlb_flush(struct f + if (info->stride_shift > PMD_SHIFT) + maxnr = 1; + +- if (info->end == TLB_FLUSH_ALL) { ++ if (info->end == TLB_FLUSH_ALL || info->freed_tables) { + invlpgb_flush_single_pcid(kern_pcid(asid)); + /* Do any CPUs supporting INVLPGB need PTI? */ + if (static_cpu_has(X86_FEATURE_PTI)) +@@ -1110,7 +1110,7 @@ static void flush_tlb_func(void *info) + * + * The only question is whether to do a full or partial flush. 
+ * +- * We do a partial flush if requested and two extra conditions ++ * We do a partial flush if requested and three extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that +@@ -1137,10 +1137,14 @@ static void flush_tlb_func(void *info) + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. ++ * ++ * 3. No page tables were freed. If page tables were freed, a full ++ * flush ensures intermediate translations in the TLB get flushed. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && +- f->new_tlb_gen == mm_tlb_gen) { ++ f->new_tlb_gen == mm_tlb_gen && ++ !f->freed_tables) { + /* Partial flush */ + unsigned long addr = f->start; + diff --git a/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch b/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch new file mode 100644 index 0000000..7feb629 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch @@ -0,0 +1,28 @@ +From 2fc0be5fbcee1a62162b699451bb94f90ec64244 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 30 Dec 2024 12:53:13 -0500 +Subject: x86/mm: only invalidate final translations with INVLPGB + +Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVPLGB. +This way only leaf mappings get removed from the TLB, leaving intermediate +translations cached. + +On the (rare) occasions where we free page tables we do a full flush, +ensuring intermediate translations get flushed from the TLB. 
+ +Signed-off-by: Rik van Riel +--- + arch/x86/include/asm/invlpgb.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/invlpgb.h ++++ b/arch/x86/include/asm/invlpgb.h +@@ -51,7 +51,7 @@ static inline void invlpgb_flush_user(un + static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr, + int nr, bool pmd_stride) + { +- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); ++ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA | INVLPGB_FINAL_ONLY); + } + + /* Flush all mappings for a given ASID, not including globals. */ diff --git a/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch b/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch new file mode 100644 index 0000000..f2714b6 --- /dev/null +++ b/debian/patches/patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch @@ -0,0 +1,92 @@ +From a3ff46a157cadb29349c5b388fc70804c351e561 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Thu, 19 Dec 2024 15:32:53 -0500 +Subject: mm: remove unnecessary calls to lru_add_drain + +There seem to be several categories of calls to lru_add_drain +and lru_add_drain_all. + +The first are code paths that recently allocated, swapped in, +or otherwise processed a batch of pages, and want them all on +the LRU. These drain pages that were recently allocated, +probably on the local CPU. + +A second category are code paths that are actively trying to +reclaim, migrate, or offline memory. These often use lru_add_drain_all, +to drain the caches on all CPUs. + +However, there also seem to be some other callers where we +aren't really doing either. They are calling lru_add_drain(), +despite operating on pages that may have been allocated +long ago, and quite possibly on different CPUs. + +Those calls are not likely to be effective at anything but +creating lock contention on the LRU locks. 
+ +Remove the lru_add_drain calls in the latter category. + +Signed-off-by: Rik van Riel +Suggested-by: David Hildenbrand +--- + mm/memory.c | 1 - + mm/mmap.c | 2 -- + mm/swap_state.c | 1 - + mm/vma.c | 2 -- + 4 files changed, 6 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are + struct mmu_notifier_range range; + struct mmu_gather tlb; + +- lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + address, end); + hugetlb_zap_begin(vma, &range.start, &range.end); +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1927,7 +1927,6 @@ void exit_mmap(struct mm_struct *mm) + goto destroy; + } + +- lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? but nobody should be looking */ +@@ -2370,7 +2369,6 @@ int relocate_vma_down(struct vm_area_str + vma, new_start, length, false, true)) + return -ENOMEM; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + +- lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); +--- a/mm/vma.c ++++ b/mm/vma.c +@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas, + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, +@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct + * were isolated before we downgraded mmap_lock. 
+ */ + mas_set(mas_detach, 1); +- lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, diff --git a/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch new file mode 100644 index 0000000..45edbcc --- /dev/null +++ b/debian/patches/patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch @@ -0,0 +1,27 @@ +From 6f554b20207f69146c07be3743b115e42f443627 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Fri, 27 Dec 2024 15:08:09 +0100 +Subject: drm/amdgpu: Add missing statement in resume_phase3 + +Fixes: 73dae652dcac776296890da215ee7dec357a1032 +See: https://gitlab.freedesktop.org/drm/amd/-/issues/3853#note_2714815 +For: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/101 +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 29b8346b..cbca5fa7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3723,6 +3723,7 @@ static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) + r = adev->ip_blocks[i].version->funcs->resume(adev); + if (r) + return r; ++ adev->ip_blocks[i].status.hw = true; + } + } + +-- +2.45.2 + diff --git a/debian/patches/series b/debian/patches/series index bcf56fc..ea49e6e 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -72,6 +72,7 @@ features/x86/x86-make-x32-syscall-support-conditional.patch bugfix/all/disable-some-marvell-phys.patch bugfix/all/fs-add-module_softdep-declarations-for-hard-coded-cr.patch bugfix/all/documentation-use-relative-source-paths-in-abi-documentation.patch 
+bugfix/all/nfsd-fix-legacy-client-tracking-initialization.patch # Miscellaneous features @@ -231,6 +232,20 @@ patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.pat patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch +patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch +patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch +patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch +patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch +patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch +patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch +patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch +patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch +patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch +patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch +patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch +patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch +patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch + patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch @@ -308,3 +323,4 @@ patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patc patchset-pf/fixes/0003-USB-core-Disable-LPM-only-for-non-suspended-ports.patch patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch +patchset-zen/fixes/0002-drm-amdgpu-Add-missing-statement-in-resume_phase3.patch