release 6.14.7

2025-05-18 12:32:18 +03:00
parent 5c06d65ded
commit a5a8a2798d
48 changed files with 215 additions and 737 deletions
--- a/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch
+++ b/debian/patches/patchset-pf/amd-pstate/0017-cpufreq-amd-pstate-Replace-all-AMD_CPPC_-macros-with.patch
@@ -17,7 +17,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

 --- a/arch/x86/include/asm/msr-index.h
 +++ b/arch/x86/include/asm/msr-index.h
-@@ -701,15 +701,17 @@
+@@ -709,15 +709,17 @@
 #define MSR_AMD_CPPC_REQ		0xc00102b3
 #define MSR_AMD_CPPC_STATUS		0xc00102b4
 
--- a/debian/patches/patchset-pf/fixes/0007-Kconfig-switch-CONFIG_SYSFS_SYCALL-default-to-n.patch
+++ b/debian/patches/patchset-pf/fixes/0007-Kconfig-switch-CONFIG_SYSFS_SYCALL-default-to-n.patch
@@ -13,7 +13,7 @@ Signed-off-by: Christian Brauner <brauner@kernel.org>

 --- a/init/Kconfig
 +++ b/init/Kconfig
-@@ -1600,6 +1600,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
+@@ -1603,6 +1603,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
 	  the unaligned access emulation.
 	  see arch/parisc/kernel/unaligned.c for reference
 
@@ -30,7 +30,7 @@ Signed-off-by: Christian Brauner <brauner@kernel.org>
 config HAVE_PCSPKR_PLATFORM
 	bool
 
-@@ -1644,16 +1654,6 @@ config SGETMASK_SYSCALL
+@@ -1647,16 +1657,6 @@ config SGETMASK_SYSCALL
 
 	  If unsure, leave the default option here.
 
--- a/debian/patches/patchset-pf/fixes/0012-Revert-drm-amd-Stop-evicting-resources-on-APUs-in-su.patch
+++ b/debian/patches/patchset-pf/fixes/0012-Revert-drm-amd-Stop-evicting-resources-on-APUs-in-su.patch
@@ -1,98 +0,0 @@
-From dca14df8b269f207ac834149126964039142b596 Mon Sep 17 00:00:00 2001
-From: Alex Deucher <alexander.deucher@amd.com>
-Date: Thu, 1 May 2025 13:00:16 -0400
-Subject: Revert "drm/amd: Stop evicting resources on APUs in suspend"
-
-This reverts commit 3a9626c816db901def438dc2513622e281186d39.
-
-This breaks S4 because we end up setting the s3/s0ix flags
-even when we are entering s4 since prepare is used by both
-flows.  The causes both the S3/s0ix and s4 flags to be set
-which breaks several checks in the driver which assume they
-are mutually exclusive.
-
-Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3634
-Cc: Mario Limonciello <mario.limonciello@amd.com>
-Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
-Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
- drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 --
- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c   | 18 ------------------
- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++---------
- 3 files changed, 2 insertions(+), 29 deletions(-)
-
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
-@@ -1594,11 +1594,9 @@ static inline void amdgpu_acpi_get_backl
- #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
- bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev);
- bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev);
-void amdgpu_choose_low_power_state(struct amdgpu_device *adev);
- #else
- static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; }
- static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; }
-static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { }
- #endif
- 
- void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
-@@ -1533,22 +1533,4 @@ bool amdgpu_acpi_is_s0ix_active(struct a
- #endif /* CONFIG_AMD_PMC */
- }
- 
-/**
- * amdgpu_choose_low_power_state
- *
- * @adev: amdgpu_device_pointer
- *
- * Choose the target low power state for the GPU
- */
-void amdgpu_choose_low_power_state(struct amdgpu_device *adev)
-{
-	if (adev->in_runpm)
-		return;
-
-	if (amdgpu_acpi_is_s0ix_active(adev))
-		adev->in_s0ix = true;
-	else if (amdgpu_acpi_is_s3_active(adev))
-		adev->in_s3 = true;
-}
-
- #endif /* CONFIG_SUSPEND */
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
-@@ -4861,15 +4861,13 @@ int amdgpu_device_prepare(struct drm_dev
- 	struct amdgpu_device *adev = drm_to_adev(dev);
- 	int i, r;
- 
-	amdgpu_choose_low_power_state(adev);
-
- 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
- 		return 0;
- 
- 	/* Evict the majority of BOs before starting suspend sequence */
- 	r = amdgpu_device_evict_resources(adev);
- 	if (r)
-		goto unprepare;
-+		return r;
- 
- 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
- 
-@@ -4880,15 +4878,10 @@ int amdgpu_device_prepare(struct drm_dev
- 			continue;
- 		r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
- 		if (r)
-			goto unprepare;
-+			return r;
- 	}
- 
- 	return 0;
-
-unprepare:
-	adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
-
-	return r;
- }
- 
- /**
--- a/debian/patches/patchset-pf/fixes/0013-drm-amdgpu-fix-pm-notifier-handling.patch
+++ b/debian/patches/patchset-pf/fixes/0013-drm-amdgpu-fix-pm-notifier-handling.patch
@@ -1,87 +0,0 @@
-From e9ee1b0a41166033eda14d11823826b79ce5131b Mon Sep 17 00:00:00 2001
-From: Alex Deucher <alexander.deucher@amd.com>
-Date: Thu, 1 May 2025 13:46:46 -0400
-Subject: drm/amdgpu: fix pm notifier handling
-
-Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
-the resource evictions properly in pm prepare based on whether
-we are suspending or hibernating.  Drop the eviction as processes
-are not frozen at this time, we we can end up getting stuck trying
-to evict VRAM while applications continue to submit work which
-causes the buffers to get pulled back into VRAM.
-
-v2: Move suspend flags out of pm notifier (Mario)
-
-Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
-Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support")
-Cc: Mario Limonciello <mario.limonciello@amd.com>
-Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
-Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++-------------
- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 10 +---------
- 2 files changed, 6 insertions(+), 22 deletions(-)
-
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
-@@ -4819,28 +4819,20 @@ static int amdgpu_device_evict_resources
-  * @data: data
-  *
-  * This function is called when the system is about to suspend or hibernate.
- * It is used to evict resources from the device before the system goes to
- * sleep while there is still access to swap.
-+ * It is used to set the appropriate flags so that eviction can be optimized
-+ * in the pm prepare callback.
-  */
- static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
- 				     void *data)
- {
- 	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
-	int r;
- 
- 	switch (mode) {
- 	case PM_HIBERNATION_PREPARE:
- 		adev->in_s4 = true;
-		fallthrough;
-	case PM_SUSPEND_PREPARE:
-		r = amdgpu_device_evict_resources(adev);
-		/*
-		 * This is considered non-fatal at this time because
-		 * amdgpu_device_prepare() will also fatally evict resources.
-		 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
-		 */
-		if (r)
-			drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
-+		break;
-+	case PM_POST_HIBERNATION:
-+		adev->in_s4 = false;
- 		break;
- 	}
- 
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
-+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
-@@ -2592,13 +2592,8 @@ static int amdgpu_pmops_freeze(struct de
- static int amdgpu_pmops_thaw(struct device *dev)
- {
- 	struct drm_device *drm_dev = dev_get_drvdata(dev);
-	struct amdgpu_device *adev = drm_to_adev(drm_dev);
-	int r;
- 
-	r = amdgpu_device_resume(drm_dev, true);
-	adev->in_s4 = false;
-
-	return r;
-+	return amdgpu_device_resume(drm_dev, true);
- }
- 
- static int amdgpu_pmops_poweroff(struct device *dev)
-@@ -2611,9 +2606,6 @@ static int amdgpu_pmops_poweroff(struct
- static int amdgpu_pmops_restore(struct device *dev)
- {
- 	struct drm_device *drm_dev = dev_get_drvdata(dev);
-	struct amdgpu_device *adev = drm_to_adev(drm_dev);
-
-	adev->in_s4 = false;
- 
- 	return amdgpu_device_resume(drm_dev, true);
- }
--- a/debian/patches/patchset-pf/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
+++ b/debian/patches/patchset-pf/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
@@ -0,0 +1,149 @@
+From 2ffeb0d8d193c35403cea13d3b7273b523631007 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Thu, 13 Feb 2025 11:13:52 -0500
+Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional
+
+Currently x86 uses CONFIG_MMU_GATHER_RCU_TABLE_FREE when using
+paravirt, and not when running on bare metal.
+
+There is no real good reason to do things differently for
+each setup. Make them all the same.
+
+Currently get_user_pages_fast synchronizes against page table
+freeing in two different ways:
+
+ - on bare metal, by blocking IRQs, which block TLB flush IPIs
+ - on paravirt, with MMU_GATHER_RCU_TABLE_FREE
+
+This is done because some paravirt TLB flush implementations
+handle the TLB flush in the hypervisor, and will do the flush
+even when the target CPU has interrupts disabled.
+
+Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
+Using RCU synchronization between page table freeing and get_user_pages_fast()
+allows bare metal to also do TLB flushing while interrupts are disabled.
+
+Various places in the mm do still block IRQs or disable preemption
+as an implicit way to block RCU frees.
+
+That makes it safe to use INVLPGB on AMD CPUs.
+
+Suggested-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Manali Shukla <Manali.Shukla@amd.com>
+Tested-by: Brendan Jackman <jackmanb@google.com>
+Tested-by: Michael Kelley <mhklinux@outlook.com>
+Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com
+---
+ arch/x86/Kconfig           |  2 +-
+ arch/x86/kernel/paravirt.c | 17 +----------------
+ arch/x86/mm/pgtable.c      | 27 ++++-----------------------
+ 3 files changed, 6 insertions(+), 40 deletions(-)
+
+--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
+@@ -277,7 +277,7 @@ config X86
+ 	select HAVE_PCI
+ 	select HAVE_PERF_REGS
+ 	select HAVE_PERF_USER_STACK_DUMP
+-	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
+	select MMU_GATHER_RCU_TABLE_FREE
+ 	select MMU_GATHER_MERGE_VMAS
+ 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ 	select HAVE_REGS_AND_STACK_ACCESS_API
+--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
+@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
+ 		static_branch_enable(&virt_spin_lock_key);
+ }
+ 
+-#ifndef CONFIG_PT_RECLAIM
+-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
+-{
+-	struct ptdesc *ptdesc = (struct ptdesc *)table;
+-
+-	pagetable_dtor(ptdesc);
+-	tlb_remove_page(tlb, ptdesc_page(ptdesc));
+-}
+-#else
+-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
+-{
+-	tlb_remove_table(tlb, table);
+-}
+-#endif
+-
+ struct static_key paravirt_steal_enabled;
+ struct static_key paravirt_steal_rq_enabled;
+ 
+@@ -197,7 +182,7 @@ struct paravirt_patch_template pv_ops =
+ 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
+ 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
+ 	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
+-	.mmu.tlb_remove_table	= native_tlb_remove_table,
+	.mmu.tlb_remove_table	= tlb_remove_table,
+ 
+ 	.mmu.exit_mmap		= paravirt_nop,
+ 	.mmu.notify_page_enc_status_changed	= paravirt_nop,
+--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
+@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
+ #define PGTABLE_HIGHMEM 0
+ #endif
+ 
+-#ifndef CONFIG_PARAVIRT
+-#ifndef CONFIG_PT_RECLAIM
+-static inline
+-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+-{
+-	struct ptdesc *ptdesc = (struct ptdesc *)table;
+-
+-	pagetable_dtor(ptdesc);
+-	tlb_remove_page(tlb, ptdesc_page(ptdesc));
+-}
+-#else
+-static inline
+-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+-{
+-	tlb_remove_table(tlb, table);
+-}
+-#endif /* !CONFIG_PT_RECLAIM */
+-#endif /* !CONFIG_PARAVIRT */
+-
+ gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
+ 
+ pgtable_t pte_alloc_one(struct mm_struct *mm)
+@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
+ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+ {
+ 	paravirt_release_pte(page_to_pfn(pte));
+-	paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+	tlb_remove_table(tlb, page_ptdesc(pte));
+ }
+ 
+ #if CONFIG_PGTABLE_LEVELS > 2
+@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *
+ #ifdef CONFIG_X86_PAE
+ 	tlb->need_flush_all = 1;
+ #endif
+-	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+	tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ }
+ 
+ #if CONFIG_PGTABLE_LEVELS > 3
+ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+ {
+ 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+-	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+	tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ }
+ 
+ #if CONFIG_PGTABLE_LEVELS > 4
+ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+ {
+ 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+-	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+	tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ }
+ #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
+ #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
--- a/debian/patches/patchset-pf/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch
+++ b/debian/patches/patchset-pf/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch
@@ -0,0 +1,89 @@
+From aadea0887cca5739137f109eab0e1b38604c8af8 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Thu, 13 Feb 2025 11:13:53 -0500
+Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call
+
+Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
+
+Get rid of the indirection by simply calling tlb_remove_table directly,
+and not going through the paravirt function pointers.
+
+Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Manali Shukla <Manali.Shukla@amd.com>
+Tested-by: Brendan Jackman <jackmanb@google.com>
+Tested-by: Michael Kelley <mhklinux@outlook.com>
+Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com
+---
+ arch/x86/hyperv/mmu.c                 | 1 -
+ arch/x86/include/asm/paravirt.h       | 5 -----
+ arch/x86/include/asm/paravirt_types.h | 2 --
+ arch/x86/kernel/kvm.c                 | 1 -
+ arch/x86/kernel/paravirt.c            | 1 -
+ arch/x86/xen/mmu_pv.c                 | 1 -
+ 6 files changed, 11 deletions(-)
+
+--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
+@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void)
+ 
+ 	pr_info("Using hypercall for remote TLB flush\n");
+ 	pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
+-	pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ }
+--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
+@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
+ 	PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
+ }
+ 
+-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+-{
+-	PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
+-}
+-
+ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+ {
+ 	PVOP_VCALL1(mmu.exit_mmap, mm);
+--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
+@@ -133,8 +133,6 @@ struct pv_mmu_ops {
+ 	void (*flush_tlb_multi)(const struct cpumask *cpus,
+ 				const struct flush_tlb_info *info);
+ 
+-	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+-
+ 	/* Hook for intercepting the destruction of an mm_struct. */
+ 	void (*exit_mmap)(struct mm_struct *mm);
+ 	void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
+--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
+@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
+ #ifdef CONFIG_SMP
+ 	if (pv_tlb_flush_supported()) {
+ 		pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+-		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ 		pr_info("KVM setup pv remote TLB flush\n");
+ 	}
+ 
+--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
+@@ -182,7 +182,6 @@ struct paravirt_patch_template pv_ops =
+ 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
+ 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
+ 	.mmu.flush_tlb_multi	= native_flush_tlb_multi,
+-	.mmu.tlb_remove_table	= tlb_remove_table,
+ 
+ 	.mmu.exit_mmap		= paravirt_nop,
+ 	.mmu.notify_page_enc_status_changed	= paravirt_nop,
+--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
+@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops
+ 		.flush_tlb_kernel = xen_flush_tlb,
+ 		.flush_tlb_one_user = xen_flush_tlb_one_user,
+ 		.flush_tlb_multi = xen_flush_tlb_multi,
+-		.tlb_remove_table = tlb_remove_table,
+ 
+ 		.pgd_alloc = xen_pgd_alloc,
+ 		.pgd_free = xen_pgd_free,
--- a/debian/patches/patchset-pf/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch
+++ b/debian/patches/patchset-pf/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch
@@ -0,0 +1,87 @@
+From 170f37d1499a28f7a1902e007111867c7cf0147f Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:36 -0500
+Subject: x86/mm: Consolidate full flush threshold decision
+
+Reduce code duplication by consolidating the decision point for whether to do
+individual invalidations or a full flush inside get_flush_tlb_info().
+
+Suggested-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
+---
+ arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
+ 1 file changed, 19 insertions(+), 22 deletions(-)
+
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -1019,6 +1019,15 @@ static struct flush_tlb_info *get_flush_
+ 	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
+ #endif
+ 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
+	}
+
+ 	info->start		= start;
+ 	info->end		= end;
+ 	info->mm		= mm;
+@@ -1045,17 +1054,8 @@ void flush_tlb_mm_range(struct mm_struct
+ 				bool freed_tables)
+ {
+ 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
+ 	u64 new_tlb_gen;
+-	int cpu;
+-
+-	cpu = get_cpu();
+-
+-	/* Should we flush just the requested range? */
+-	if ((end == TLB_FLUSH_ALL) ||
+-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+-		start = 0;
+-		end = TLB_FLUSH_ALL;
+-	}
+ 
+ 	/* This is also a barrier that synchronizes with switch_mm(). */
+ 	new_tlb_gen = inc_mm_tlb_gen(mm);
+@@ -1108,22 +1108,19 @@ static void do_kernel_range_flush(void *
+ 
+ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+ {
+-	/* Balance as user space task's flush, a bit conservative */
+-	if (end == TLB_FLUSH_ALL ||
+-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
+-		on_each_cpu(do_flush_tlb_all, NULL, 1);
+-	} else {
+-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
+ 
+-		preempt_disable();
+-		info = get_flush_tlb_info(NULL, start, end, 0, false,
+-					  TLB_GENERATION_INVALID);
+	guard(preempt)();
+ 
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
+
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
+ 		on_each_cpu(do_kernel_range_flush, info, 1);
+ 
+-		put_flush_tlb_info();
+-		preempt_enable();
+-	}
+	put_flush_tlb_info();
+ }
+ 
+ /*
--- a/debian/patches/patchset-pf/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch
+++ b/debian/patches/patchset-pf/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch
@@ -0,0 +1,103 @@
+From acb5a284db4fa3dbbb246ab8fa58da0143cd68ce Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:37 -0500
+Subject: x86/mm: Add INVLPGB feature and Kconfig entry
+
+In addition, the CPU advertises the maximum number of pages that can be
+shot down with one INVLPGB instruction in CPUID. Save that information
+for later use.
+
+  [ bp: use cpu_has(), typos, massage. ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com
+---
+ arch/x86/Kconfig.cpu                     | 4 ++++
+ arch/x86/include/asm/cpufeatures.h       | 1 +
+ arch/x86/include/asm/disabled-features.h | 8 +++++++-
+ arch/x86/include/asm/tlbflush.h          | 3 +++
+ arch/x86/kernel/cpu/amd.c                | 6 ++++++
+ 5 files changed, 21 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
+@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT
+ 	  This lets you choose what x86 vendor support code your kernel
+ 	  will include.
+ 
+config BROADCAST_TLB_FLUSH
+	def_bool y
+	depends on CPU_SUP_AMD && 64BIT
+
+ config CPU_SUP_INTEL
+ 	default y
+ 	bool "Support Intel processors" if PROCESSOR_SELECT
+--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
+@@ -338,6 +338,7 @@
+ #define X86_FEATURE_CLZERO		(13*32+ 0) /* "clzero" CLZERO instruction */
+ #define X86_FEATURE_IRPERF		(13*32+ 1) /* "irperf" Instructions Retired Count */
+ #define X86_FEATURE_XSAVEERPTR		(13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB		(13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
+ #define X86_FEATURE_RDPRU		(13*32+ 4) /* "rdpru" Read processor register at user level */
+ #define X86_FEATURE_WBNOINVD		(13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
+ #define X86_FEATURE_AMD_IBPB		(13*32+12) /* Indirect Branch Prediction Barrier */
+--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
+@@ -129,6 +129,12 @@
+ #define DISABLE_SEV_SNP		(1 << (X86_FEATURE_SEV_SNP & 31))
+ #endif
+ 
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+#define DISABLE_INVLPGB		0
+#else
+#define DISABLE_INVLPGB		(1 << (X86_FEATURE_INVLPGB & 31))
+#endif
+
+ /*
+  * Make sure to add features to the correct mask
+  */
+@@ -146,7 +152,7 @@
+ #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
+ 			 DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
+ #define DISABLED_MASK12	(DISABLE_FRED|DISABLE_LAM)
+-#define DISABLED_MASK13	0
+#define DISABLED_MASK13	(DISABLE_INVLPGB)
+ #define DISABLED_MASK14	0
+ #define DISABLED_MASK15	0
+ #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void)
+ extern unsigned long mmu_cr4_features;
+ extern u32 *trampoline_cr4_features;
+ 
+/* How many pages can be invalidated with one INVLPGB. */
+extern u16 invlpgb_count_max;
+
+ extern void initialize_tlbstate_and_flush(void);
+ 
+ /*
+--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
+@@ -29,6 +29,8 @@
+ 
+ #include "cpu.h"
+ 
+u16 invlpgb_count_max __ro_after_init;
+
+ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+ {
+ 	u32 gprs[8] = { 0 };
+@@ -1145,6 +1147,10 @@ static void cpu_detect_tlb_amd(struct cp
+ 		tlb_lli_2m[ENTRIES] = eax & mask;
+ 
+ 	tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+	/* Max number of pages INVLPGB can invalidate in one shot */
+	if (cpu_has(c, X86_FEATURE_INVLPGB))
+		invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
+ }
+ 
+ static const struct cpu_dev amd_cpu_dev = {
--- a/debian/patches/patchset-pf/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch
+++ b/debian/patches/patchset-pf/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch
@@ -0,0 +1,170 @@
+From 27bab4a6ed6ee7b7b0e2d216b8802800ef26b2ad Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Fri, 28 Feb 2025 20:32:30 +0100
+Subject: x86/mm: Add INVLPGB support code
+
+Add helper functions and definitions needed to use broadcast TLB
+invalidation on AMD CPUs.
+
+  [ bp:
+      - Cleanup commit message
+      - Improve and expand comments
+      - push the preemption guards inside the invlpgb* helpers
+      - merge improvements from dhansen
+      - add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
+	can't do DCE properly yet and looks at the inline asm and
+	complains about it getting a u64 argument on 32-bit code ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
+---
+ arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 132 insertions(+)
+
+--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
+@@ -6,6 +6,9 @@
+ static inline void tlb_flush(struct mmu_gather *tlb);
+ 
+ #include <asm-generic/tlb.h>
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+ 
+ static inline void tlb_flush(struct mmu_gather *tlb)
+ {
+@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
+ 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ }
+ 
+enum addr_stride {
+	PTE_STRIDE = 0,
+	PMD_STRIDE = 1
+};
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * The instruction takes the number of extra pages to invalidate, beyond
+ * the first page, while __invlpgb gets the more human readable number of
+ * pages to invalidate.
+ *
+ * The bits in rax[0:2] determine respectively which components of the address
+ * (VA, PCID, ASID) get compared when flushing. If none of these bits are
+ * set, *any* address in the specified range matches.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+			     unsigned long addr, u16 nr_pages,
+			     enum addr_stride stride, u8 flags)
+{
+	u32 edx = (pcid << 16) | asid;
+	u32 ecx = (stride << 31) | (nr_pages - 1);
+	u64 rax = addr | flags;
+
+	/* The low bits in rax are for flags. Verify addr is clean. */
+	VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+	/* INVLPGB; supported in binutils >= 2.36. */
+	asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
+}
+
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
+{
+	__invlpgb(asid, pcid, 0, 1, 0, flags);
+}
+
+static inline void __tlbsync(void)
+{
+	/*
+	 * TLBSYNC waits for INVLPGB instructions originating on the same CPU
+	 * to have completed. Print a warning if the task has been migrated,
+	 * and might not be waiting on all the INVLPGBs issued during this TLB
+	 * invalidation sequence.
+	 */
+	cant_migrate();
+
+	/* TLBSYNC: supported in binutils >= 2.36. */
+	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+#else
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+			     unsigned long addr, u16 nr_pages,
+			     enum addr_stride s, u8 flags) { }
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
+static inline void __tlbsync(void) { }
+#endif
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID:			    invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA			BIT(0)
+#define INVLPGB_FLAG_PCID		BIT(1)
+#define INVLPGB_FLAG_ASID		BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL	BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY		BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED	BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS	0UL
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+						unsigned long addr,
+						u16 nr, bool stride)
+{
+	enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
+	u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
+
+	__invlpgb(0, pcid, addr, nr, str, flags);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+	__invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+	/*
+	 * TLBSYNC at the end needs to make sure all flushes done on the
+	 * current CPU have been executed system-wide. Therefore, make
+	 * sure the task is not migrated in between; preemption is
+	 * disabled to achieve that because it is cheaper.
+	 */
+	guard(preempt)();
+	__invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
+	__tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+	__invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+	guard(preempt)();
+	__invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
+	__tlbsync();
+}
+ #endif /* _ASM_X86_TLB_H */
--- a/debian/patches/patchset-pf/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch
+++ b/debian/patches/patchset-pf/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch
@@ -0,0 +1,97 @@
+From 358d71638f420efe8f7e05ce74aefe13e9320283 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:39 -0500
+Subject: x86/mm: Use INVLPGB for kernel TLB flushes
+
+Use broadcast TLB invalidation for kernel addresses when available.
+Remove the need to send IPIs for kernel TLB flushes.
+
+   [ bp: Integrate dhansen's comments additions, merge the
+     flush_tlb_all() change into this one too. ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
+---
+ arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 44 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -1083,7 +1083,6 @@ void flush_tlb_mm_range(struct mm_struct
+ 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+ }
+ 
+-
+ static void do_flush_tlb_all(void *info)
+ {
+ 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+@@ -1093,7 +1092,32 @@ static void do_flush_tlb_all(void *info)
+ void flush_tlb_all(void)
+ {
+ 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+-	on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+	/* First try (faster) hardware-assisted TLB invalidation. */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		/* Fall back to the IPI-based invalidation. */
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+	unsigned long addr, nr;
+
+	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+		nr = (info->end - addr) >> PAGE_SHIFT;
+
+		/*
+		 * INVLPGB has a limit on the size of ranges it can
+		 * flush. Break up large flushes.
+		 */
+		nr = clamp_val(nr, 1, invlpgb_count_max);
+
+		invlpgb_flush_addr_nosync(addr, nr);
+	}
+	__tlbsync();
+ }
+ 
+ static void do_kernel_range_flush(void *info)
+@@ -1106,6 +1130,22 @@ static void do_kernel_range_flush(void *
+ 		flush_tlb_one_kernel(addr);
+ }
+ 
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_flush_all();
+	else
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		invlpgb_kernel_range_flush(info);
+	else
+		on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
+ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+ {
+ 	struct flush_tlb_info *info;
+@@ -1116,9 +1156,9 @@ void flush_tlb_kernel_range(unsigned lon
+ 				  TLB_GENERATION_INVALID);
+ 
+ 	if (info->end == TLB_FLUSH_ALL)
+-		on_each_cpu(do_flush_tlb_all, NULL, 1);
+		kernel_tlb_flush_all(info);
+ 	else
+-		on_each_cpu(do_kernel_range_flush, info, 1);
+		kernel_tlb_flush_range(info);
+ 
+ 	put_flush_tlb_info();
+ }
--- a/debian/patches/patchset-pf/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch
+++ b/debian/patches/patchset-pf/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch
@@ -0,0 +1,32 @@
+From 7cf099de79e12d6c4949f733c8cbb241bb08f07a Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:41 -0500
+Subject: x86/mm: Use broadcast TLB flushing in page reclaim
+
+Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather
+than all the individual mappings that may be getting invalidated.
+
+Use broadcast TLB flushing when that is available.
+
+  [ bp: Massage commit message. ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com
+---
+ arch/x86/mm/tlb.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -1339,7 +1339,9 @@ void arch_tlbbatch_flush(struct arch_tlb
+ 	 * a local TLB flush is needed. Optimize this use-case by calling
+ 	 * flush_tlb_func_local() directly in this case.
+ 	 */
+-	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		invlpgb_flush_all_nonglobals();
+	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ 		flush_tlb_multi(&batch->cpumask, info);
+ 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ 		lockdep_assert_irqs_enabled();
--- a/debian/patches/patchset-pf/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch
+++ b/debian/patches/patchset-pf/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch
@@ -0,0 +1,286 @@
+From f9ecaaca7ac26789d7d3e0d8022b7c99599dc8a3 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:42 -0500
+Subject: x86/mm: Add global ASID allocation helper functions
+
+Add functions to manage global ASID space. Multithreaded processes that are
+simultaneously active on 4 or more CPUs can get a global ASID, resulting in the
+same PCID being used for that process on every CPU.
+
+This in turn will allow the kernel to use hardware-assisted TLB flushing
+through AMD INVLPGB or Intel RAR for these processes.
+
+  [ bp:
+   - Extend use_global_asid() comment
+   - s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g
+   - other touchups ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com
+---
+ arch/x86/include/asm/mmu.h         |  12 +++
+ arch/x86/include/asm/mmu_context.h |   2 +
+ arch/x86/include/asm/tlbflush.h    |  37 +++++++
+ arch/x86/mm/tlb.c                  | 154 ++++++++++++++++++++++++++++-
+ 4 files changed, 202 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
+@@ -69,6 +69,18 @@ typedef struct {
+ 	u16 pkey_allocation_map;
+ 	s16 execute_only_pkey;
+ #endif
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+	/*
+	 * The global ASID will be a non-zero value when the process has
+	 * the same ASID across all CPUs, allowing it to make use of
+	 * hardware-assisted remote TLB invalidation like AMD INVLPGB.
+	 */
+	u16 global_asid;
+
+	/* The process is transitioning to a new global ASID number. */
+	bool asid_transition;
+#endif
+ } mm_context_t;
+ 
+ #define INIT_MM_CONTEXT(mm)						\
+--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
+@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
+ #define enter_lazy_tlb enter_lazy_tlb
+ extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+ 
+extern void mm_free_global_asid(struct mm_struct *mm);
+
+ /*
+  * Init a new mm.  Used on mm copies, like at fork()
+  * and on mm's that are brand-new, like at execve().
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -6,6 +6,7 @@
+ #include <linux/mmu_notifier.h>
+ #include <linux/sched.h>
+ 
+#include <asm/barrier.h>
+ #include <asm/processor.h>
+ #include <asm/cpufeature.h>
+ #include <asm/special_insns.h>
+@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long
+ void flush_tlb_multi(const struct cpumask *cpumask,
+ 		      const struct flush_tlb_info *info);
+ 
+static inline bool is_dyn_asid(u16 asid)
+{
+	return asid < TLB_NR_DYN_ASIDS;
+}
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+	u16 asid;
+
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return 0;
+
+	asid = smp_load_acquire(&mm->context.global_asid);
+
+	/* mm->context.global_asid is either 0, or a global ASID */
+	VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+	return asid;
+}
+
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+{
+	/*
+	 * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+	 * finish_asid_transition() needs to observe asid_transition = true
+	 * once it observes global_asid.
+	 */
+	mm->context.asid_transition = true;
+	smp_store_release(&mm->context.global_asid, asid);
+}
+#else
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
+
+ #ifdef CONFIG_PARAVIRT
+ #include <asm/paravirt.h>
+ #endif
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -74,13 +74,15 @@
+  * use different names for each of them:
+  *
+  * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+- *         the canonical identifier for an mm
+ *         the canonical identifier for an mm, dynamically allocated on each CPU
+ *         [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ *         the canonical, global identifier for an mm, identical across all CPUs
+  *
+- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
+  *         the value we write into the PCID part of CR3; corresponds to the
+  *         ASID+1, because PCID 0 is special.
+  *
+- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
+  *         for KPTI each mm has two address spaces and thus needs two
+  *         PCID values, but we can still do with a single ASID denomination
+  *         for each mm. Corresponds to kPCID + 2048.
+@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st
+ }
+ 
+ /*
+ * Global ASIDs are allocated for multi-threaded processes that are
+ * active on multiple CPUs simultaneously, giving each of those
+ * processes the same PCID on every CPU, for use with hardware-assisted
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
+ *
+ * These global ASIDs are held for the lifetime of the process.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+/*
+ * When the search for a free ASID in the global ASID space reaches
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
+ * freed global ASIDs are safe to re-use.
+ *
+ * This way the global flush only needs to happen at ASID rollover
+ * time, and not at ASID allocation time.
+ */
+static void reset_global_asid_space(void)
+{
+	lockdep_assert_held(&global_asid_lock);
+
+	invlpgb_flush_all_nonglobals();
+
+	/*
+	 * The TLB flush above makes it safe to re-use the previously
+	 * freed global ASIDs.
+	 */
+	bitmap_andnot(global_asid_used, global_asid_used,
+			global_asid_freed, MAX_ASID_AVAILABLE);
+	bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+	/* Restart the search from the start of global ASID space. */
+	last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 allocate_global_asid(void)
+{
+	u16 asid;
+
+	lockdep_assert_held(&global_asid_lock);
+
+	/* The previous allocation hit the edge of available address space */
+	if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+		reset_global_asid_space();
+
+	asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+	if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
+		/* This should never happen. */
+		VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
+				global_asid_available);
+		return 0;
+	}
+
+	/* Claim this global ASID. */
+	__set_bit(asid, global_asid_used);
+	last_global_asid = asid;
+	global_asid_available--;
+	return asid;
+}
+
+/*
+ * Check whether a process is currently active on more than @threshold CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+	int count = 0;
+	int cpu;
+
+	/* This quick check should eliminate most single threaded programs. */
+	if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+		return false;
+
+	/* Slower check to make sure. */
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		/* Skip the CPUs that aren't really running this process. */
+		if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+			continue;
+
+		if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+			continue;
+
+		if (++count > threshold)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+	u16 asid;
+
+	guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+	/* This process is already using broadcast TLB invalidation. */
+	if (mm_global_asid(mm))
+		return;
+
+	/*
+	 * The last global ASID was consumed while waiting for the lock.
+	 *
+	 * If this fires, a more aggressive ASID reuse scheme might be
+	 * needed.
+	 */
+	if (!global_asid_available) {
+		VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+		return;
+	}
+
+	asid = allocate_global_asid();
+	if (!asid)
+		return;
+
+	mm_assign_global_asid(mm, asid);
+}
+
+void mm_free_global_asid(struct mm_struct *mm)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return;
+
+	if (!mm_global_asid(mm))
+		return;
+
+	guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+	/* The global ASID can be re-used only after flush at wrap-around. */
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+	__set_bit(mm->context.global_asid, global_asid_freed);
+
+	mm->context.global_asid = 0;
+	global_asid_available++;
+#endif
+}
+
+/*
+  * Given an ASID, flush the corresponding user ASID.  We can delay this
+  * until the next time we switch to it.
+  *
--- a/debian/patches/patchset-pf/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch
+++ b/debian/patches/patchset-pf/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch
@@ -0,0 +1,219 @@
+From b56070b9f121507cabe352e03f0c534db2d5adc7 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:43 -0500
+Subject: x86/mm: Handle global ASID context switch and TLB flush
+
+Add context switch and TLB flush support for processes that use a global
+ASID and PCID across all CPUs.
+
+At both context switch time and TLB flush time, check whether a task is
+switching to a global ASID and, if so, reload the TLB with the new ASID as
+appropriate.
+
+In both code paths, the TLB flush is avoided if a global ASID is used, because
+the global ASIDs are always kept up to date across CPUs, even when the
+process is not running on a CPU.
+
+  [ bp:
+   - Massage
+   - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
+  ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com
+---
+ arch/x86/include/asm/tlbflush.h | 14 ++++++
+ arch/x86/mm/tlb.c               | 77 ++++++++++++++++++++++++++++++---
+ 2 files changed, 84 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid)
+ 	return asid < TLB_NR_DYN_ASIDS;
+ }
+ 
+static inline bool is_global_asid(u16 asid)
+{
+	return !is_dyn_asid(asid);
+}
+
+ #ifdef CONFIG_BROADCAST_TLB_FLUSH
+ static inline u16 mm_global_asid(struct mm_struct *mm)
+ {
+@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid
+ 	mm->context.asid_transition = true;
+ 	smp_store_release(&mm->context.global_asid, asid);
+ }
+
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return false;
+
+	return mm && READ_ONCE(mm->context.asid_transition);
+}
+ #else
+ static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+ #endif /* CONFIG_BROADCAST_TLB_FLUSH */
+ 
+ #ifdef CONFIG_PARAVIRT
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st
+ 		return;
+ 	}
+ 
+	/*
+	 * TLB consistency for global ASIDs is maintained with hardware assisted
+	 * remote TLB flushing. Global ASIDs are always up to date.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		u16 global_asid = mm_global_asid(next);
+
+		if (global_asid) {
+			*new_asid = global_asid;
+			*need_flush = false;
+			return;
+		}
+	}
+
+ 	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+ 		clear_asid_other();
+ 
+@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc
+ }
+ 
+ /*
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
+ */
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
+{
+	u16 global_asid = mm_global_asid(mm);
+
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return false;
+
+	/* Process is transitioning to a global ASID */
+	if (global_asid && asid != global_asid)
+		return true;
+
+	return false;
+}
+
+/*
+  * Given an ASID, flush the corresponding user ASID.  We can delay this
+  * until the next time we switch to it.
+  *
+@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct
+ 	 */
+ 	if (prev == next) {
+ 		/* Not actually switching mm's */
+-		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+		VM_WARN_ON(is_dyn_asid(prev_asid) &&
+			   this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ 			   next->context.ctx_id);
+ 
+ 		/*
+@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct
+ 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ 			cpumask_set_cpu(cpu, mm_cpumask(next));
+ 
+		/* Check if the current mm is transitioning to a global ASID */
+		if (mm_needs_global_asid(next, prev_asid)) {
+			next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+			choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+			goto reload_tlb;
+		}
+
+		/*
+		 * Broadcast TLB invalidation keeps this ASID up to date
+		 * all the time.
+		 */
+		if (is_global_asid(prev_asid))
+			return;
+
+ 		/*
+ 		 * If the CPU is not in lazy TLB mode, we are just switching
+ 		 * from one thread in a process to another thread in the same
+@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct
+ 		cond_mitigation(tsk);
+ 
+ 		/*
+		 * Let nmi_uaccess_okay() and finish_asid_transition()
+		 * know that CR3 is changing.
+		 */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
+		/*
+ 		 * Leave this CPU in prev's mm_cpumask. Atomic writes to
+ 		 * mm_cpumask can be expensive under contention. The CPU
+ 		 * will be removed lazily at TLB flush time.
+@@ -768,18 +821,12 @@ void switch_mm_irqs_off(struct mm_struct
+ 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ 
+ 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+-
+-		/*
+-		 * Indicate that CR3 is about to change. nmi_uaccess_okay()
+-		 * and others are sensitive to the window where mm_cpumask(),
+-		 * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
+- 		 */
+-		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+-		barrier();
+ 	}
+ 
+reload_tlb:
+ 	new_lam = mm_lam_cr3_mask(next);
+ 	if (need_flush) {
+		VM_WARN_ON_ONCE(is_global_asid(new_asid));
+ 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ 		load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
+@@ -898,7 +945,7 @@ static void flush_tlb_func(void *info)
+ 	const struct flush_tlb_info *f = info;
+ 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+	u64 local_tlb_gen;
+ 	bool local = smp_processor_id() == f->initiating_cpu;
+ 	unsigned long nr_invalidate = 0;
+ 	u64 mm_tlb_gen;
+@@ -921,6 +968,16 @@ static void flush_tlb_func(void *info)
+ 	if (unlikely(loaded_mm == &init_mm))
+ 		return;
+ 
+	/* Reload the ASID if transitioning into or out of a global ASID */
+	if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
+		switch_mm_irqs_off(NULL, loaded_mm, NULL);
+		loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	}
+
+	/* Broadcast ASIDs are always kept up to date with INVLPGB. */
+	if (is_global_asid(loaded_mm_asid))
+		return;
+
+ 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+ 		   loaded_mm->context.ctx_id);
+ 
+@@ -938,6 +995,8 @@ static void flush_tlb_func(void *info)
+ 		return;
+ 	}
+ 
+	local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
+ 	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
+ 		     f->new_tlb_gen <= local_tlb_gen)) {
+ 		/*
+@@ -1120,7 +1179,7 @@ STATIC_NOPV void native_flush_tlb_multi(
+ 	 * up on the new contents of what used to be page tables, while
+ 	 * doing a speculative memory access.
+ 	 */
+-	if (info->freed_tables)
+	if (info->freed_tables || mm_in_asid_transition(info->mm))
+ 		on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
+ 	else
+ 		on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
--- a/debian/patches/patchset-pf/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch
+++ b/debian/patches/patchset-pf/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch
@@ -0,0 +1,88 @@
+From 6d3b8545e2c3c638363fb449a99b5a6cbab87a49 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:44 -0500
+Subject: x86/mm: Add global ASID process exit helpers
+
+A global ASID is allocated for the lifetime of a process. Free the global ASID
+at process exit time.
+
+  [ bp: Massage, create helpers, hide details inside them. ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com
+---
+ arch/x86/include/asm/mmu_context.h | 8 +++++++-
+ arch/x86/include/asm/tlbflush.h    | 9 +++++++++
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
+@@ -2,7 +2,6 @@
+ #ifndef _ASM_X86_MMU_CONTEXT_H
+ #define _ASM_X86_MMU_CONTEXT_H
+ 
+-#include <asm/desc.h>
+ #include <linux/atomic.h>
+ #include <linux/mm_types.h>
+ #include <linux/pkeys.h>
+@@ -13,6 +12,7 @@
+ #include <asm/paravirt.h>
+ #include <asm/debugreg.h>
+ #include <asm/gsseg.h>
+#include <asm/desc.h>
+ 
+ extern atomic64_t last_mm_ctx_id;
+ 
+@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s
+ #define enter_lazy_tlb enter_lazy_tlb
+ extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+ 
+#define mm_init_global_asid mm_init_global_asid
+extern void mm_init_global_asid(struct mm_struct *mm);
+
+ extern void mm_free_global_asid(struct mm_struct *mm);
+ 
+ /*
+@@ -163,6 +166,8 @@ static inline int init_new_context(struc
+ 		mm->context.execute_only_pkey = -1;
+ 	}
+ #endif
+
+	mm_init_global_asid(mm);
+ 	mm_reset_untag_mask(mm);
+ 	init_new_context_ldt(mm);
+ 	return 0;
+@@ -172,6 +177,7 @@ static inline int init_new_context(struc
+ static inline void destroy_context(struct mm_struct *mm)
+ {
+ 	destroy_context_ldt(mm);
+	mm_free_global_asid(mm);
+ }
+ 
+ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct
+ 	return asid;
+ }
+ 
+static inline void mm_init_global_asid(struct mm_struct *mm)
+{
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		mm->context.global_asid = 0;
+		mm->context.asid_transition = false;
+	}
+}
+
+ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+ {
+ 	/*
+@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
+ }
+ #else
+ static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
+ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+ static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+ #endif /* CONFIG_BROADCAST_TLB_FLUSH */
--- a/debian/patches/patchset-pf/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
+++ b/debian/patches/patchset-pf/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
@@ -0,0 +1,219 @@
+From 077e9ceb65f514ea63afc65cce86ce8677e77012 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:45 -0500
+Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
+ processes
+
+There is not enough room in the 12-bit ASID address space to hand out
+broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
+when they are observed to be simultaneously running on 4 or more CPUs.
+
+This also allows single-threaded processes to continue using the cheaper, local
+TLB invalidation instructions like INVLPG.
+
+Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
+a generically named broadcast_tlb_flush() function which can later also be
+used for Intel RAR.
+
+Combined with the removal of unnecessary lru_add_drain() calls (see
+https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
+a nice performance boost for the will-it-scale tlb_flush2_threads test on an
+AMD Milan system with 36 cores:
+
+  - vanilla kernel:           527k loops/second
+  - lru_add_drain removal:    731k loops/second
+  - only INVLPGB:             527k loops/second
+  - lru_add_drain + INVLPGB: 1157k loops/second
+
+Profiling with only the INVLPGB changes showed that, while TLB invalidation went
+down from 40% of the total CPU time to only around 4% of CPU time, the
+contention simply moved to the LRU lock.
+
+Fixing both at the same time roughly doubles the number of iterations per second
+for this case.
+
+Comparing will-it-scale tlb_flush2_threads with several different numbers of
+threads on a 72 CPU AMD Milan shows similar results. The number represents the
+total number of loops per second across all the threads:
+
+  threads	tip		INVLPGB
+
+  1		315k		304k
+  2		423k		424k
+  4		644k		1032k
+  8		652k		1267k
+  16		737k		1368k
+  32		759k		1199k
+  64		636k		1094k
+  72		609k		993k
+
+1 and 2 thread performance is similar with and without INVLPGB, because
+INVLPGB is only used on processes using 4 or more CPUs simultaneously.
+
+The number is the median across 5 runs.
+
+Some numbers closer to real world performance can be found at Phoronix, thanks
+to Michael:
+
+https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits
+
+  [ bp:
+   - Massage
+   - :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
+   - :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
+   - Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com
+   ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
+Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
+---
+ arch/x86/include/asm/tlbflush.h |   6 ++
+ arch/x86/mm/tlb.c               | 104 +++++++++++++++++++++++++++++++-
+ 2 files changed, 109 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid
+ 	smp_store_release(&mm->context.global_asid, asid);
+ }
+ 
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
+{
+	WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+ static inline bool mm_in_asid_transition(struct mm_struct *mm)
+ {
+ 	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition
+ static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+ static inline void mm_init_global_asid(struct mm_struct *mm) { }
+ static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
+ static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
+ #endif /* CONFIG_BROADCAST_TLB_FLUSH */
+ 
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct
+ }
+ 
+ /*
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
+ * global ASIDs are handed out to processes that have frequent TLB
+ * flushes and are active on 4 or more CPUs simultaneously.
+ */
+static void consider_global_asid(struct mm_struct *mm)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return;
+
+	/* Check every once in a while. */
+	if ((current->pid & 0x1f) != (jiffies & 0x1f))
+		return;
+
+	/*
+	 * Assign a global ASID if the process is active on
+	 * 4 or more CPUs simultaneously.
+	 */
+	if (mm_active_cpus_exceeds(mm, 3))
+		use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+	struct mm_struct *mm = info->mm;
+	int bc_asid = mm_global_asid(mm);
+	int cpu;
+
+	if (!mm_in_asid_transition(mm))
+		return;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		/*
+		 * The remote CPU is context switching. Wait for that to
+		 * finish, to catch the unlikely case of it switching to
+		 * the target mm with an out of date ASID.
+		 */
+		while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+			cpu_relax();
+
+		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+			continue;
+
+		/*
+		 * If at least one CPU is not using the global ASID yet,
+		 * send a TLB flush IPI. The IPI should cause stragglers
+		 * to transition soon.
+		 *
+		 * This can race with the CPU switching to another task;
+		 * that results in a (harmless) extra IPI.
+		 */
+		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+			flush_tlb_multi(mm_cpumask(info->mm), info);
+			return;
+		}
+	}
+
+	/* All the CPUs running this process are using the global ASID. */
+	mm_clear_asid_transition(mm);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+	bool pmd = info->stride_shift == PMD_SHIFT;
+	unsigned long asid = mm_global_asid(info->mm);
+	unsigned long addr = info->start;
+
+	/*
+	 * TLB flushes with INVLPGB are kicked off asynchronously.
+	 * The inc_mm_tlb_gen() guarantees page table updates are done
+	 * before these TLB flushes happen.
+	 */
+	if (info->end == TLB_FLUSH_ALL) {
+		invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (cpu_feature_enabled(X86_FEATURE_PTI))
+			invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+	} else do {
+		unsigned long nr = 1;
+
+		if (info->stride_shift <= PMD_SHIFT) {
+			nr = (info->end - addr) >> info->stride_shift;
+			nr = clamp_val(nr, 1, invlpgb_count_max);
+		}
+
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+		if (cpu_feature_enabled(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+		addr += nr << info->stride_shift;
+	} while (addr < info->end);
+
+	finish_asid_transition(info);
+
+	/* Wait for the INVLPGBs kicked off above to finish. */
+	__tlbsync();
+}
+
+/*
+  * Given an ASID, flush the corresponding user ASID.  We can delay this
+  * until the next time we switch to it.
+  *
+@@ -1275,9 +1374,12 @@ void flush_tlb_mm_range(struct mm_struct
+ 	 * a local TLB flush is needed. Optimize this use-case by calling
+ 	 * flush_tlb_func_local() directly in this case.
+ 	 */
+-	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+	if (mm_global_asid(mm)) {
+		broadcast_tlb_flush(info);
+	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ 		info->trim_cpumask = should_trim_cpumask(mm);
+ 		flush_tlb_multi(mm_cpumask(mm), info);
+		consider_global_asid(mm);
+ 	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ 		lockdep_assert_irqs_enabled();
+ 		local_irq_disable();
--- a/debian/patches/patchset-pf/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch
+++ b/debian/patches/patchset-pf/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch
@@ -0,0 +1,83 @@
+From 1994cff363a37aff5b1232ca9f757b02ae244956 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Tue, 25 Feb 2025 22:00:47 -0500
+Subject: x86/mm: Enable AMD translation cache extensions
+
+With AMD TCE (translation cache extensions) only the intermediate mappings
+that cover the address range zapped by INVLPG / INVLPGB get invalidated,
+rather than all intermediate mappings getting zapped at every TLB invalidation.
+
+This can help reduce the TLB miss rate, by keeping more intermediate mappings
+in the cache.
+
+From the AMD manual:
+
+Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
+1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
+entries. When this bit is 0, these instructions remove the target PTE from the
+TLB as well as all upper-level table entries that are cached in the TLB,
+whether or not they are associated with the target PTE.  When this bit is set,
+these instructions will remove the target PTE and only those upper-level
+entries that lead to the target PTE in the page table hierarchy, leaving
+unrelated upper-level entries intact.
+
+  [ bp: use cpu_has()... I know, it is a mess. ]
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
+---
+ arch/x86/include/asm/msr-index.h       | 2 ++
+ arch/x86/kernel/cpu/amd.c              | 4 ++++
+ tools/arch/x86/include/asm/msr-index.h | 2 ++
+ 3 files changed, 8 insertions(+)
+
+--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
+@@ -25,6 +25,7 @@
+ #define _EFER_SVME		12 /* Enable virtualization */
+ #define _EFER_LMSLE		13 /* Long Mode Segment Limit Enable */
+ #define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE		15 /* Enable Translation Cache Extensions */
+ #define _EFER_AUTOIBRS		21 /* Enable Automatic IBRS */
+ 
+ #define EFER_SCE		(1<<_EFER_SCE)
+@@ -34,6 +35,7 @@
+ #define EFER_SVME		(1<<_EFER_SVME)
+ #define EFER_LMSLE		(1<<_EFER_LMSLE)
+ #define EFER_FFXSR		(1<<_EFER_FFXSR)
+#define EFER_TCE		(1<<_EFER_TCE)
+ #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
+ 
+ /*
+--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
+@@ -1081,6 +1081,10 @@ static void init_amd(struct cpuinfo_x86
+ 
+ 	/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ 	clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+	/* Enable Translation Cache Extension */
+	if (cpu_has(c, X86_FEATURE_TCE))
+		msr_set_bit(MSR_EFER, _EFER_TCE);
+ }
+ 
+ #ifdef CONFIG_X86_32
+--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
+@@ -25,6 +25,7 @@
+ #define _EFER_SVME		12 /* Enable virtualization */
+ #define _EFER_LMSLE		13 /* Long Mode Segment Limit Enable */
+ #define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE		15 /* Enable Translation Cache Extensions */
+ #define _EFER_AUTOIBRS		21 /* Enable Automatic IBRS */
+ 
+ #define EFER_SCE		(1<<_EFER_SCE)
+@@ -34,6 +35,7 @@
+ #define EFER_SVME		(1<<_EFER_SVME)
+ #define EFER_LMSLE		(1<<_EFER_LMSLE)
+ #define EFER_FFXSR		(1<<_EFER_FFXSR)
+#define EFER_TCE		(1<<_EFER_TCE)
+ #define EFER_AUTOIBRS		(1<<_EFER_AUTOIBRS)
+ 
+ /*
--- a/debian/patches/patchset-pf/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
+++ b/debian/patches/patchset-pf/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
@@ -0,0 +1,121 @@
+From 5932a2c8122050c4a2f71588778feb0677fe32b4 Mon Sep 17 00:00:00 2001
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Tue, 4 Mar 2025 12:59:56 +0100
+Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
+
+When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
+the ASID valid bit is not set, the instruction will flush the TLB entries that
+match the specified criteria for any ASID, not just those of the host. If
+virtual machines are running on the system, this may result in inadvertent
+flushes of guest TLB entries.
+
+When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
+not intercepted by the hypervisor, the hardware will replace the requested ASID
+with the guest ASID and set the ASID valid bit before doing the broadcast
+invalidation. Thus a guest is only able to flush its own TLB entries.
+
+So to limit the host TLB flushing reach, always set the ASID valid bit using an
+ASID value of 0 (which represents the host/hypervisor). This will result in
+the desired effect in both host and guest.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local
+---
+ arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++-----------------
+ 1 file changed, 32 insertions(+), 26 deletions(-)
+
+--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
+@@ -33,6 +33,27 @@ enum addr_stride {
+ 	PMD_STRIDE = 1
+ };
+ 
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID:			    invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA			BIT(0)
+#define INVLPGB_FLAG_PCID		BIT(1)
+#define INVLPGB_FLAG_ASID		BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL	BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY		BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED	BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS	0UL
+
+ #ifdef CONFIG_BROADCAST_TLB_FLUSH
+ /*
+  * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+@@ -40,14 +61,20 @@ enum addr_stride {
+  * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+  * be done in a parallel fashion.
+  *
+- * The instruction takes the number of extra pages to invalidate, beyond
+- * the first page, while __invlpgb gets the more human readable number of
+- * pages to invalidate.
+ * The instruction takes the number of extra pages to invalidate, beyond the
+ * first page, while __invlpgb gets the more human readable number of pages to
+ * invalidate.
+  *
+  * The bits in rax[0:2] determine respectively which components of the address
+  * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
+  * address in the specified range matches.
+  *
+ * Since it is desired to only flush TLB entries for the ASID that is executing
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
+ * use the actual ASID value of the guest.
+ *
+  * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+  * this CPU have completed.
+  */
+@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo
+ 			     unsigned long addr, u16 nr_pages,
+ 			     enum addr_stride stride, u8 flags)
+ {
+-	u32 edx = (pcid << 16) | asid;
+	u64 rax = addr | flags | INVLPGB_FLAG_ASID;
+ 	u32 ecx = (stride << 31) | (nr_pages - 1);
+-	u64 rax = addr | flags;
+	u32 edx = (pcid << 16) | asid;
+ 
+ 	/* The low bits in rax are for flags. Verify addr is clean. */
+ 	VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne
+ static inline void __tlbsync(void) { }
+ #endif
+ 
+-/*
+- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+- * of the three. For example:
+- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+- * - FLAG_PCID:			    invalidate all TLB entries matching the PCID
+- *
+- * The first is used to invalidate (kernel) mappings at a particular
+- * address across all processes.
+- *
+- * The latter invalidates all TLB entries matching a PCID.
+- */
+-#define INVLPGB_FLAG_VA			BIT(0)
+-#define INVLPGB_FLAG_PCID		BIT(1)
+-#define INVLPGB_FLAG_ASID		BIT(2)
+-#define INVLPGB_FLAG_INCLUDE_GLOBAL	BIT(3)
+-#define INVLPGB_FLAG_FINAL_ONLY		BIT(4)
+-#define INVLPGB_FLAG_INCLUDE_NESTED	BIT(5)
+-
+-/* The implied mode when all bits are clear: */
+-#define INVLPGB_MODE_ALL_NONGLOBALS	0UL
+-
+ static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ 						unsigned long addr,
+ 						u16 nr, bool stride)
--- a/debian/patches/patchset-pf/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch
+++ b/debian/patches/patchset-pf/invlpgb/0014-x86-mm-Only-do-broadcast-flush-from-reclaim-if-pages.patch
@@ -0,0 +1,70 @@
+From 0e0a5ca37a8e3b06f450f4093ba1b6d6f33c2161 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Wed, 19 Mar 2025 13:25:20 -0400
+Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped
+
+Track whether pages were unmapped from any MM (even ones with a currently
+empty mm_cpumask) by the reclaim code, to figure out whether or not
+broadcast TLB flush should be done when reclaim finishes.
+
+The reason any MM must be tracked, and not only ones contributing to the
+tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to
+date even on CPUs where the MM is not currently active.
+
+This change allows reclaim to avoid doing TLB flushes when only clean page
+cache pages and/or slab memory were reclaimed, which is fairly common.
+
+( This is a simpler alternative to the code that was in my INVLPGB series
+  before, and it seems to capture most of the benefit due to how common
+  it is to reclaim only page cache. )
+
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn
+---
+ arch/x86/include/asm/tlbbatch.h | 5 +++++
+ arch/x86/include/asm/tlbflush.h | 1 +
+ arch/x86/mm/tlb.c               | 3 ++-
+ 3 files changed, 8 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
+@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
+ 	 * the PFNs being flushed..
+ 	 */
+ 	struct cpumask cpumask;
+	/*
+	 * Set if pages were unmapped from any MM, even one that does not
+	 * have active CPUs in its cpumask.
+	 */
+	bool unmapped_pages;
+ };
+ 
+ #endif /* _ARCH_X86_TLBBATCH_H */
+--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
+@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen
+ {
+ 	inc_mm_tlb_gen(mm);
+ 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	batch->unmapped_pages = true;
+ 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+ }
+ 
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -1648,8 +1648,9 @@ void arch_tlbbatch_flush(struct arch_tlb
+ 	 * a local TLB flush is needed. Optimize this use-case by calling
+ 	 * flush_tlb_func_local() directly in this case.
+ 	 */
+-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
+ 		invlpgb_flush_all_nonglobals();
+		batch->unmapped_pages = false;
+ 	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ 		flush_tlb_multi(&batch->cpumask, info);
+ 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
--- a/debian/patches/patchset-pf/invlpgb/0015-x86-mm-Eliminate-window-where-TLB-flushes-may-be-ina.patch
+++ b/debian/patches/patchset-pf/invlpgb/0015-x86-mm-Eliminate-window-where-TLB-flushes-may-be-ina.patch
@@ -0,0 +1,92 @@
+From 6ae491224973eb4013ee67a8c05c420f057d5fee Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Thu, 8 May 2025 15:41:32 -0700
+Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently
+ skipped
+
+tl;dr: There is a window in the mm switching code where the new CR3 is
+set and the CPU should be getting TLB flushes for the new mm.  But
+should_flush_tlb() has a bug and suppresses the flush.  Fix it by
+widening the window where should_flush_tlb() sends an IPI.
+
+Long Version:
+
+=== History ===
+
+There were a few things leading up to this.
+
+First, updating mm_cpumask() was observed to be too expensive, so it was
+made lazier.  But being lazy caused too many unnecessary IPIs to CPUs
+due to the now-lazy mm_cpumask().  So code was added to cull
+mm_cpumask() periodically[2].  But that culling was a bit too aggressive
+and skipped sending TLB flushes to CPUs that need them.  So here we are
+again.
+
+=== Problem ===
+
+The too-aggressive code in should_flush_tlb() strikes in this window:
+
+	// Turn on IPIs for this CPU/mm combination, but only
+	// if should_flush_tlb() agrees:
+	cpumask_set_cpu(cpu, mm_cpumask(next));
+
+	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+	choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+	load_new_mm_cr3(need_flush);
+	// ^ After 'need_flush' is set to false, IPIs *MUST*
+	// be sent to this CPU and not be ignored.
+
+        this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	// ^ Not until this point does should_flush_tlb()
+	// become true!
+
+should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3()
+and writing to 'loaded_mm', which is a window where they should not be
+suppressed.  Whoops.
+
+=== Solution ===
+
+Thankfully, the fuzzy "just about to write CR3" window is already marked
+with loaded_mm==LOADED_MM_SWITCHING.  Simply checking for that state in
+should_flush_tlb() is sufficient to ensure that the CPU is targeted with
+an IPI.
+
+This will cause more TLB flush IPIs.  But the window is relatively small
+and I do not expect this to cause any kind of measurable performance
+impact.
+
+Update the comment where LOADED_MM_SWITCHING is written since it grew
+yet another user.
+
+Peter Z also raised a concern that should_flush_tlb() might not observe
+'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off()
+writes them.  Add a barrier to ensure that they are observed in the
+order they are written.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Rik van Riel <riel@surriel.com>
+Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1]
+Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2]
+Reported-by: Stephen Dolan <sdolan@janestreet.com>
+Cc: stable@vger.kernel.org
+Acked-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+---
+ arch/x86/mm/tlb.c | 22 +++++++++++++++++++---
+ 1 file changed, 19 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
+@@ -900,8 +900,9 @@ void switch_mm_irqs_off(struct mm_struct
+ 		cond_mitigation(tsk);
+ 
+ 		/*
+-		 * Let nmi_uaccess_okay() and finish_asid_transition()
+-		 * know that CR3 is changing.
+		 * Indicate that CR3 is about to change. nmi_uaccess_okay()
+		 * and others are sensitive to the window where mm_cpumask(),
+		 * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
+ 		 */
+ 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ 		barrier();
--- a/debian/patches/patchset-pf/smb/0001-ksmbd-prevent-rename-with-empty-string.patch
+++ b/debian/patches/patchset-pf/smb/0001-ksmbd-prevent-rename-with-empty-string.patch
@@ -1,33 +0,0 @@
-From 7b3f0f8d11f1b4319f593ba02d4dece890755dfa Mon Sep 17 00:00:00 2001
-From: Namjae Jeon <linkinjeon@kernel.org>
-Date: Wed, 30 Apr 2025 11:18:28 +0900
-Subject: ksmbd: prevent rename with empty string
-
-Client can send empty newname string to ksmbd server.
-It will cause a kernel oops from d_alloc.
-This patch return the error when attempting to rename
-a file or directory with an empty new name string.
-
-Cc: stable@vger.kernel.org
-Reported-by: Norbert Szetei <norbert@doyensec.com>
-Tested-by: Norbert Szetei <norbert@doyensec.com>
-Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
-Signed-off-by: Steve French <stfrench@microsoft.com>
---
- fs/smb/server/smb2pdu.c | 5 +++++
- 1 file changed, 5 insertions(+)
-
--- a/fs/smb/server/smb2pdu.c
-+++ b/fs/smb/server/smb2pdu.c
-@@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int
- 		return name;
- 	}
- 
-+	if (*name == '\0') {
-+		kfree(name);
-+		return ERR_PTR(-EINVAL);
-+	}
-+
- 	if (*name == '\\') {
- 		pr_err("not allow directory name included leading slash\n");
- 		kfree(name);
--- a/debian/patches/patchset-pf/smb/0001-smb-client-fix-memory-leak-during-error-handling-for.patch
+++ b/debian/patches/patchset-pf/smb/0001-smb-client-fix-memory-leak-during-error-handling-for.patch
@@ -0,0 +1,35 @@
+From 8ef14a884df5aaf48cf5f7ce6c91e7318cb07d4e Mon Sep 17 00:00:00 2001
+From: Jethro Donaldson <devel@jro.nz>
+Date: Thu, 15 May 2025 01:23:23 +1200
+Subject: smb: client: fix memory leak during error handling for POSIX mkdir
+
+The response buffer for the CREATE request handled by smb311_posix_mkdir()
+is leaked on the error path (goto err_free_rsp_buf) because the structure
+pointer *rsp passed to free_rsp_buf() is not assigned until *after* the
+error condition is checked.
+
+As *rsp is initialised to NULL, free_rsp_buf() becomes a no-op and the leak
+is instead reported by __kmem_cache_shutdown() upon subsequent rmmod of
+cifs.ko if (and only if) the error path has been hit.
+
+Pass rsp_iov.iov_base to free_rsp_buf() instead, similar to the code in
+other functions in smb2pdu.c for which *rsp is assigned late.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Jethro Donaldson <devel@jro.nz>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+---
+ fs/smb/client/smb2pdu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
+@@ -2967,7 +2967,7 @@ replay_again:
+ 	/* Eventually save off posix specific response info and timestamps */
+ 
+ err_free_rsp_buf:
+-	free_rsp_buf(resp_buftype, rsp);
+	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+ 	kfree(pc_buf);
+ err_free_req:
+ 	cifs_small_buf_release(req);
--- a/debian/patches/patchset-pf/smb/0002-ksmbd-prevent-out-of-bounds-stream-writes-by-validat.patch
+++ b/debian/patches/patchset-pf/smb/0002-ksmbd-prevent-out-of-bounds-stream-writes-by-validat.patch
@@ -1,37 +0,0 @@
-From fb87d390de327c76b11ed544de83771118f7b0c5 Mon Sep 17 00:00:00 2001
-From: Norbert Szetei <norbert@doyensec.com>
-Date: Fri, 2 May 2025 08:21:58 +0900
-Subject: ksmbd: prevent out-of-bounds stream writes by validating *pos
-
-ksmbd_vfs_stream_write() did not validate whether the write offset
-(*pos) was within the bounds of the existing stream data length (v_len).
-If *pos was greater than or equal to v_len, this could lead to an
-out-of-bounds memory write.
-
-This patch adds a check to ensure *pos is less than v_len before
-proceeding. If the condition fails, -EINVAL is returned.
-
-Cc: stable@vger.kernel.org
-Signed-off-by: Norbert Szetei <norbert@doyensec.com>
-Acked-by: Namjae Jeon <linkinjeon@kernel.org>
-Signed-off-by: Steve French <stfrench@microsoft.com>
---
- fs/smb/server/vfs.c | 7 +++++++
- 1 file changed, 7 insertions(+)
-
--- a/fs/smb/server/vfs.c
-+++ b/fs/smb/server/vfs.c
-@@ -443,6 +443,13 @@ static int ksmbd_vfs_stream_write(struct
- 		goto out;
- 	}
- 
-+	if (v_len <= *pos) {
-+		pr_err("stream write position %lld is out of bounds (stream length: %zd)\n",
-+				*pos, v_len);
-+		err = -EINVAL;
-+		goto out;
-+	}
-+
- 	if (v_len < size) {
- 		wbuf = kvzalloc(size, KSMBD_DEFAULT_GFP);
- 		if (!wbuf) {
--- a/debian/patches/patchset-pf/smb/0003-ksmbd-Fix-UAF-in-__close_file_table_ids.patch
+++ b/debian/patches/patchset-pf/smb/0003-ksmbd-Fix-UAF-in-__close_file_table_ids.patch
@@ -1,74 +0,0 @@
-From 67ea573ce44aeac74e659879cdeb6ac39212d0b9 Mon Sep 17 00:00:00 2001
-From: Sean Heelan <seanheelan@gmail.com>
-Date: Tue, 6 May 2025 22:04:52 +0900
-Subject: ksmbd: Fix UAF in __close_file_table_ids
-
-A use-after-free is possible if one thread destroys the file
-via __ksmbd_close_fd while another thread holds a reference to
-it. The existing checks on fp->refcount are not sufficient to
-prevent this.
-
-The fix takes ft->lock around the section which removes the
-file from the file table. This prevents two threads acquiring the
-same file pointer via __close_file_table_ids, as well as the other
-functions which retrieve a file from the IDR and which already use
-this same lock.
-
-Cc: stable@vger.kernel.org
-Signed-off-by: Sean Heelan <seanheelan@gmail.com>
-Acked-by: Namjae Jeon <linkinjeon@kernel.org>
-Signed-off-by: Steve French <stfrench@microsoft.com>
---
- fs/smb/server/vfs_cache.c | 33 ++++++++++++++++++++++++++-------
- 1 file changed, 26 insertions(+), 7 deletions(-)
-
--- a/fs/smb/server/vfs_cache.c
-+++ b/fs/smb/server/vfs_cache.c
-@@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file
- 		       bool (*skip)(struct ksmbd_tree_connect *tcon,
- 				    struct ksmbd_file *fp))
- {
-	unsigned int			id;
-	struct ksmbd_file		*fp;
-	int				num = 0;
-+	struct ksmbd_file *fp;
-+	unsigned int id = 0;
-+	int num = 0;
- 
-	idr_for_each_entry(ft->idr, fp, id) {
-		if (skip(tcon, fp))
-+	while (1) {
-+		write_lock(&ft->lock);
-+		fp = idr_get_next(ft->idr, &id);
-+		if (!fp) {
-+			write_unlock(&ft->lock);
-+			break;
-+		}
-+
-+		if (skip(tcon, fp) ||
-+		    !atomic_dec_and_test(&fp->refcount)) {
-+			id++;
-+			write_unlock(&ft->lock);
- 			continue;
-+		}
- 
- 		set_close_state_blocked_works(fp);
-+		idr_remove(ft->idr, fp->volatile_id);
-+		fp->volatile_id = KSMBD_NO_FID;
-+		write_unlock(&ft->lock);
-+
-+		down_write(&fp->f_ci->m_lock);
-+		list_del_init(&fp->node);
-+		up_write(&fp->f_ci->m_lock);
- 
-		if (!atomic_dec_and_test(&fp->refcount))
-			continue;
- 		__ksmbd_close_fd(ft, fp);
-+
- 		num++;
-+		id++;
- 	}
-+
- 	return num;
- }
-