release 6.14.7
This commit is contained in:
@@ -17,7 +17,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
|
||||
--- a/arch/x86/include/asm/msr-index.h
|
||||
+++ b/arch/x86/include/asm/msr-index.h
|
||||
@@ -701,15 +701,17 @@
|
||||
@@ -709,15 +709,17 @@
|
||||
#define MSR_AMD_CPPC_REQ 0xc00102b3
|
||||
#define MSR_AMD_CPPC_STATUS 0xc00102b4
|
||||
|
||||
|
@@ -13,7 +13,7 @@ Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
|
||||
--- a/init/Kconfig
|
||||
+++ b/init/Kconfig
|
||||
@@ -1600,6 +1600,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
|
||||
@@ -1603,6 +1603,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
|
||||
the unaligned access emulation.
|
||||
see arch/parisc/kernel/unaligned.c for reference
|
||||
|
||||
@@ -30,7 +30,7 @@ Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
config HAVE_PCSPKR_PLATFORM
|
||||
bool
|
||||
|
||||
@@ -1644,16 +1654,6 @@ config SGETMASK_SYSCALL
|
||||
@@ -1647,16 +1657,6 @@ config SGETMASK_SYSCALL
|
||||
|
||||
If unsure, leave the default option here.
|
||||
|
||||
|
@@ -1,98 +0,0 @@
|
||||
From dca14df8b269f207ac834149126964039142b596 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Deucher <alexander.deucher@amd.com>
|
||||
Date: Thu, 1 May 2025 13:00:16 -0400
|
||||
Subject: Revert "drm/amd: Stop evicting resources on APUs in suspend"
|
||||
|
||||
This reverts commit 3a9626c816db901def438dc2513622e281186d39.
|
||||
|
||||
This breaks S4 because we end up setting the s3/s0ix flags
|
||||
even when we are entering s4 since prepare is used by both
|
||||
flows. This causes both the S3/s0ix and s4 flags to be set
|
||||
which breaks several checks in the driver which assume they
|
||||
are mutually exclusive.
|
||||
|
||||
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3634
|
||||
Cc: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
|
||||
---
|
||||
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 --
|
||||
drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 18 ------------------
|
||||
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++---------
|
||||
3 files changed, 2 insertions(+), 29 deletions(-)
|
||||
|
||||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
|
||||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
|
||||
@@ -1594,11 +1594,9 @@ static inline void amdgpu_acpi_get_backl
|
||||
#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
|
||||
bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev);
|
||||
bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev);
|
||||
-void amdgpu_choose_low_power_state(struct amdgpu_device *adev);
|
||||
#else
|
||||
static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; }
|
||||
static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; }
|
||||
-static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { }
|
||||
#endif
|
||||
|
||||
void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
|
||||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
|
||||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
|
||||
@@ -1533,22 +1533,4 @@ bool amdgpu_acpi_is_s0ix_active(struct a
|
||||
#endif /* CONFIG_AMD_PMC */
|
||||
}
|
||||
|
||||
-/**
|
||||
- * amdgpu_choose_low_power_state
|
||||
- *
|
||||
- * @adev: amdgpu_device_pointer
|
||||
- *
|
||||
- * Choose the target low power state for the GPU
|
||||
- */
|
||||
-void amdgpu_choose_low_power_state(struct amdgpu_device *adev)
|
||||
-{
|
||||
- if (adev->in_runpm)
|
||||
- return;
|
||||
-
|
||||
- if (amdgpu_acpi_is_s0ix_active(adev))
|
||||
- adev->in_s0ix = true;
|
||||
- else if (amdgpu_acpi_is_s3_active(adev))
|
||||
- adev->in_s3 = true;
|
||||
-}
|
||||
-
|
||||
#endif /* CONFIG_SUSPEND */
|
||||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
|
||||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
|
||||
@@ -4861,15 +4861,13 @@ int amdgpu_device_prepare(struct drm_dev
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
int i, r;
|
||||
|
||||
- amdgpu_choose_low_power_state(adev);
|
||||
-
|
||||
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
|
||||
return 0;
|
||||
|
||||
/* Evict the majority of BOs before starting suspend sequence */
|
||||
r = amdgpu_device_evict_resources(adev);
|
||||
if (r)
|
||||
- goto unprepare;
|
||||
+ return r;
|
||||
|
||||
flush_delayed_work(&adev->gfx.gfx_off_delay_work);
|
||||
|
||||
@@ -4880,15 +4878,10 @@ int amdgpu_device_prepare(struct drm_dev
|
||||
continue;
|
||||
r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
|
||||
if (r)
|
||||
- goto unprepare;
|
||||
+ return r;
|
||||
}
|
||||
|
||||
return 0;
|
||||
-
|
||||
-unprepare:
|
||||
- adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
|
||||
-
|
||||
- return r;
|
||||
}
|
||||
|
||||
/**
|
@@ -1,87 +0,0 @@
|
||||
From e9ee1b0a41166033eda14d11823826b79ce5131b Mon Sep 17 00:00:00 2001
|
||||
From: Alex Deucher <alexander.deucher@amd.com>
|
||||
Date: Thu, 1 May 2025 13:46:46 -0400
|
||||
Subject: drm/amdgpu: fix pm notifier handling
|
||||
|
||||
Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
|
||||
the resource evictions properly in pm prepare based on whether
|
||||
we are suspending or hibernating. Drop the eviction as processes
|
||||
are not frozen at this time, we can end up getting stuck trying
|
||||
to evict VRAM while applications continue to submit work which
|
||||
causes the buffers to get pulled back into VRAM.
|
||||
|
||||
v2: Move suspend flags out of pm notifier (Mario)
|
||||
|
||||
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
|
||||
Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support")
|
||||
Cc: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
|
||||
---
|
||||
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++-------------
|
||||
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +---------
|
||||
2 files changed, 6 insertions(+), 22 deletions(-)
|
||||
|
||||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
|
||||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
|
||||
@@ -4819,28 +4819,20 @@ static int amdgpu_device_evict_resources
|
||||
* @data: data
|
||||
*
|
||||
* This function is called when the system is about to suspend or hibernate.
|
||||
- * It is used to evict resources from the device before the system goes to
|
||||
- * sleep while there is still access to swap.
|
||||
+ * It is used to set the appropriate flags so that eviction can be optimized
|
||||
+ * in the pm prepare callback.
|
||||
*/
|
||||
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
|
||||
void *data)
|
||||
{
|
||||
struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
|
||||
- int r;
|
||||
|
||||
switch (mode) {
|
||||
case PM_HIBERNATION_PREPARE:
|
||||
adev->in_s4 = true;
|
||||
- fallthrough;
|
||||
- case PM_SUSPEND_PREPARE:
|
||||
- r = amdgpu_device_evict_resources(adev);
|
||||
- /*
|
||||
- * This is considered non-fatal at this time because
|
||||
- * amdgpu_device_prepare() will also fatally evict resources.
|
||||
- * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
|
||||
- */
|
||||
- if (r)
|
||||
- drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
|
||||
+ break;
|
||||
+ case PM_POST_HIBERNATION:
|
||||
+ adev->in_s4 = false;
|
||||
break;
|
||||
}
|
||||
|
||||
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
|
||||
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
|
||||
@@ -2592,13 +2592,8 @@ static int amdgpu_pmops_freeze(struct de
|
||||
static int amdgpu_pmops_thaw(struct device *dev)
|
||||
{
|
||||
struct drm_device *drm_dev = dev_get_drvdata(dev);
|
||||
- struct amdgpu_device *adev = drm_to_adev(drm_dev);
|
||||
- int r;
|
||||
|
||||
- r = amdgpu_device_resume(drm_dev, true);
|
||||
- adev->in_s4 = false;
|
||||
-
|
||||
- return r;
|
||||
+ return amdgpu_device_resume(drm_dev, true);
|
||||
}
|
||||
|
||||
static int amdgpu_pmops_poweroff(struct device *dev)
|
||||
@@ -2611,9 +2606,6 @@ static int amdgpu_pmops_poweroff(struct
|
||||
static int amdgpu_pmops_restore(struct device *dev)
|
||||
{
|
||||
struct drm_device *drm_dev = dev_get_drvdata(dev);
|
||||
- struct amdgpu_device *adev = drm_to_adev(drm_dev);
|
||||
-
|
||||
- adev->in_s4 = false;
|
||||
|
||||
return amdgpu_device_resume(drm_dev, true);
|
||||
}
|
149
debian/patches/patchset-pf/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
vendored
Normal file
149
debian/patches/patchset-pf/invlpgb/0001-x86-mm-Make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
From 2ffeb0d8d193c35403cea13d3b7273b523631007 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Thu, 13 Feb 2025 11:13:52 -0500
|
||||
Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional
|
||||
|
||||
Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
|
||||
paravirt, and not when running on bare metal.
|
||||
|
||||
There is no real good reason to do things differently for
|
||||
each setup. Make them all the same.
|
||||
|
||||
Currently get_user_pages_fast synchronizes against page table
|
||||
freeing in two different ways:
|
||||
|
||||
- on bare metal, by blocking IRQs, which block TLB flush IPIs
|
||||
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
|
||||
|
||||
This is done because some paravirt TLB flush implementations
|
||||
handle the TLB flush in the hypervisor, and will do the flush
|
||||
even when the target CPU has interrupts disabled.
|
||||
|
||||
Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
|
||||
Using RCU synchronization between page table freeing and get_user_pages_fast()
|
||||
allows bare metal to also do TLB flushing while interrupts are disabled.
|
||||
|
||||
Various places in the mm do still block IRQs or disable preemption
|
||||
as an implicit way to block RCU frees.
|
||||
|
||||
That makes it safe to use INVLPGB on AMD CPUs.
|
||||
|
||||
Suggested-by: Peter Zijlstra <peterz@infradead.org>
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
|
||||
Tested-by: Brendan Jackman <jackmanb@google.com>
|
||||
Tested-by: Michael Kelley <mhklinux@outlook.com>
|
||||
Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com
|
||||
---
|
||||
arch/x86/Kconfig | 2 +-
|
||||
arch/x86/kernel/paravirt.c | 17 +----------------
|
||||
arch/x86/mm/pgtable.c | 27 ++++-----------------------
|
||||
3 files changed, 6 insertions(+), 40 deletions(-)
|
||||
|
||||
--- a/arch/x86/Kconfig
|
||||
+++ b/arch/x86/Kconfig
|
||||
@@ -277,7 +277,7 @@ config X86
|
||||
select HAVE_PCI
|
||||
select HAVE_PERF_REGS
|
||||
select HAVE_PERF_USER_STACK_DUMP
|
||||
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
|
||||
+ select MMU_GATHER_RCU_TABLE_FREE
|
||||
select MMU_GATHER_MERGE_VMAS
|
||||
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
|
||||
select HAVE_REGS_AND_STACK_ACCESS_API
|
||||
--- a/arch/x86/kernel/paravirt.c
|
||||
+++ b/arch/x86/kernel/paravirt.c
|
||||
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
|
||||
static_branch_enable(&virt_spin_lock_key);
|
||||
}
|
||||
|
||||
-#ifndef CONFIG_PT_RECLAIM
|
||||
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- struct ptdesc *ptdesc = (struct ptdesc *)table;
|
||||
-
|
||||
- pagetable_dtor(ptdesc);
|
||||
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
|
||||
-}
|
||||
-#else
|
||||
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- tlb_remove_table(tlb, table);
|
||||
-}
|
||||
-#endif
|
||||
-
|
||||
struct static_key paravirt_steal_enabled;
|
||||
struct static_key paravirt_steal_rq_enabled;
|
||||
|
||||
@@ -197,7 +182,7 @@ struct paravirt_patch_template pv_ops =
|
||||
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
||||
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
||||
.mmu.flush_tlb_multi = native_flush_tlb_multi,
|
||||
- .mmu.tlb_remove_table = native_tlb_remove_table,
|
||||
+ .mmu.tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.mmu.exit_mmap = paravirt_nop,
|
||||
.mmu.notify_page_enc_status_changed = paravirt_nop,
|
||||
--- a/arch/x86/mm/pgtable.c
|
||||
+++ b/arch/x86/mm/pgtable.c
|
||||
@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
|
||||
#define PGTABLE_HIGHMEM 0
|
||||
#endif
|
||||
|
||||
-#ifndef CONFIG_PARAVIRT
|
||||
-#ifndef CONFIG_PT_RECLAIM
|
||||
-static inline
|
||||
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- struct ptdesc *ptdesc = (struct ptdesc *)table;
|
||||
-
|
||||
- pagetable_dtor(ptdesc);
|
||||
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
|
||||
-}
|
||||
-#else
|
||||
-static inline
|
||||
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- tlb_remove_table(tlb, table);
|
||||
-}
|
||||
-#endif /* !CONFIG_PT_RECLAIM */
|
||||
-#endif /* !CONFIG_PARAVIRT */
|
||||
-
|
||||
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
|
||||
|
||||
pgtable_t pte_alloc_one(struct mm_struct *mm)
|
||||
@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
|
||||
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
|
||||
{
|
||||
paravirt_release_pte(page_to_pfn(pte));
|
||||
- paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
|
||||
+ tlb_remove_table(tlb, page_ptdesc(pte));
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 2
|
||||
@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *
|
||||
#ifdef CONFIG_X86_PAE
|
||||
tlb->need_flush_all = 1;
|
||||
#endif
|
||||
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
|
||||
+ tlb_remove_table(tlb, virt_to_ptdesc(pmd));
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 3
|
||||
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
|
||||
{
|
||||
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
|
||||
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
|
||||
+ tlb_remove_table(tlb, virt_to_ptdesc(pud));
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 4
|
||||
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
|
||||
{
|
||||
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
|
||||
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
|
||||
+ tlb_remove_table(tlb, virt_to_ptdesc(p4d));
|
||||
}
|
||||
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
|
||||
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
|
89
debian/patches/patchset-pf/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch
vendored
Normal file
89
debian/patches/patchset-pf/invlpgb/0002-x86-mm-Remove-pv_ops.mmu.tlb_remove_table-call.patch
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
From aadea0887cca5739137f109eab0e1b38604c8af8 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Thu, 13 Feb 2025 11:13:53 -0500
|
||||
Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call
|
||||
|
||||
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
|
||||
|
||||
Get rid of the indirection by simply calling tlb_remove_table directly,
|
||||
and not going through the paravirt function pointers.
|
||||
|
||||
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
|
||||
Tested-by: Brendan Jackman <jackmanb@google.com>
|
||||
Tested-by: Michael Kelley <mhklinux@outlook.com>
|
||||
Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com
|
||||
---
|
||||
arch/x86/hyperv/mmu.c | 1 -
|
||||
arch/x86/include/asm/paravirt.h | 5 -----
|
||||
arch/x86/include/asm/paravirt_types.h | 2 --
|
||||
arch/x86/kernel/kvm.c | 1 -
|
||||
arch/x86/kernel/paravirt.c | 1 -
|
||||
arch/x86/xen/mmu_pv.c | 1 -
|
||||
6 files changed, 11 deletions(-)
|
||||
|
||||
--- a/arch/x86/hyperv/mmu.c
|
||||
+++ b/arch/x86/hyperv/mmu.c
|
||||
@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void)
|
||||
|
||||
pr_info("Using hypercall for remote TLB flush\n");
|
||||
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
}
|
||||
--- a/arch/x86/include/asm/paravirt.h
|
||||
+++ b/arch/x86/include/asm/paravirt.h
|
||||
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
|
||||
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
|
||||
}
|
||||
|
||||
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
|
||||
-}
|
||||
-
|
||||
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
PVOP_VCALL1(mmu.exit_mmap, mm);
|
||||
--- a/arch/x86/include/asm/paravirt_types.h
|
||||
+++ b/arch/x86/include/asm/paravirt_types.h
|
||||
@@ -133,8 +133,6 @@ struct pv_mmu_ops {
|
||||
void (*flush_tlb_multi)(const struct cpumask *cpus,
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
|
||||
-
|
||||
/* Hook for intercepting the destruction of an mm_struct. */
|
||||
void (*exit_mmap)(struct mm_struct *mm);
|
||||
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
|
||||
--- a/arch/x86/kernel/kvm.c
|
||||
+++ b/arch/x86/kernel/kvm.c
|
||||
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
|
||||
#ifdef CONFIG_SMP
|
||||
if (pv_tlb_flush_supported()) {
|
||||
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
pr_info("KVM setup pv remote TLB flush\n");
|
||||
}
|
||||
|
||||
--- a/arch/x86/kernel/paravirt.c
|
||||
+++ b/arch/x86/kernel/paravirt.c
|
||||
@@ -182,7 +182,6 @@ struct paravirt_patch_template pv_ops =
|
||||
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
||||
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
||||
.mmu.flush_tlb_multi = native_flush_tlb_multi,
|
||||
- .mmu.tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.mmu.exit_mmap = paravirt_nop,
|
||||
.mmu.notify_page_enc_status_changed = paravirt_nop,
|
||||
--- a/arch/x86/xen/mmu_pv.c
|
||||
+++ b/arch/x86/xen/mmu_pv.c
|
||||
@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops
|
||||
.flush_tlb_kernel = xen_flush_tlb,
|
||||
.flush_tlb_one_user = xen_flush_tlb_one_user,
|
||||
.flush_tlb_multi = xen_flush_tlb_multi,
|
||||
- .tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.pgd_alloc = xen_pgd_alloc,
|
||||
.pgd_free = xen_pgd_free,
|
87
debian/patches/patchset-pf/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch
vendored
Normal file
87
debian/patches/patchset-pf/invlpgb/0003-x86-mm-Consolidate-full-flush-threshold-decision.patch
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
From 170f37d1499a28f7a1902e007111867c7cf0147f Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:36 -0500
|
||||
Subject: x86/mm: Consolidate full flush threshold decision
|
||||
|
||||
Reduce code duplication by consolidating the decision point for whether to do
|
||||
individual invalidations or a full flush inside get_flush_tlb_info().
|
||||
|
||||
Suggested-by: Dave Hansen <dave.hansen@intel.com>
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Acked-by: Dave Hansen <dave.hansen@intel.com>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
|
||||
1 file changed, 19 insertions(+), 22 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1019,6 +1019,15 @@ static struct flush_tlb_info *get_flush_
|
||||
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
|
||||
#endif
|
||||
|
||||
+ /*
|
||||
+ * If the number of flushes is so large that a full flush
|
||||
+ * would be faster, do a full flush.
|
||||
+ */
|
||||
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
|
||||
+ start = 0;
|
||||
+ end = TLB_FLUSH_ALL;
|
||||
+ }
|
||||
+
|
||||
info->start = start;
|
||||
info->end = end;
|
||||
info->mm = mm;
|
||||
@@ -1045,17 +1054,8 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
bool freed_tables)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
+ int cpu = get_cpu();
|
||||
u64 new_tlb_gen;
|
||||
- int cpu;
|
||||
-
|
||||
- cpu = get_cpu();
|
||||
-
|
||||
- /* Should we flush just the requested range? */
|
||||
- if ((end == TLB_FLUSH_ALL) ||
|
||||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
||||
- start = 0;
|
||||
- end = TLB_FLUSH_ALL;
|
||||
- }
|
||||
|
||||
/* This is also a barrier that synchronizes with switch_mm(). */
|
||||
new_tlb_gen = inc_mm_tlb_gen(mm);
|
||||
@@ -1108,22 +1108,19 @@ static void do_kernel_range_flush(void *
|
||||
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
- /* Balance as user space task's flush, a bit conservative */
|
||||
- if (end == TLB_FLUSH_ALL ||
|
||||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
- } else {
|
||||
- struct flush_tlb_info *info;
|
||||
+ struct flush_tlb_info *info;
|
||||
|
||||
- preempt_disable();
|
||||
- info = get_flush_tlb_info(NULL, start, end, 0, false,
|
||||
- TLB_GENERATION_INVALID);
|
||||
+ guard(preempt)();
|
||||
|
||||
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
|
||||
+ TLB_GENERATION_INVALID);
|
||||
+
|
||||
+ if (info->end == TLB_FLUSH_ALL)
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+ else
|
||||
on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
|
||||
- put_flush_tlb_info();
|
||||
- preempt_enable();
|
||||
- }
|
||||
+ put_flush_tlb_info();
|
||||
}
|
||||
|
||||
/*
|
103
debian/patches/patchset-pf/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch
vendored
Normal file
103
debian/patches/patchset-pf/invlpgb/0004-x86-mm-Add-INVLPGB-feature-and-Kconfig-entry.patch
vendored
Normal file
@@ -0,0 +1,103 @@
|
||||
From acb5a284db4fa3dbbb246ab8fa58da0143cd68ce Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:37 -0500
|
||||
Subject: x86/mm: Add INVLPGB feature and Kconfig entry
|
||||
|
||||
In addition, the CPU advertises the maximum number of pages that can be
|
||||
shot down with one INVLPGB instruction in CPUID. Save that information
|
||||
for later use.
|
||||
|
||||
[ bp: use cpu_has(), typos, massage. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com
|
||||
---
|
||||
arch/x86/Kconfig.cpu | 4 ++++
|
||||
arch/x86/include/asm/cpufeatures.h | 1 +
|
||||
arch/x86/include/asm/disabled-features.h | 8 +++++++-
|
||||
arch/x86/include/asm/tlbflush.h | 3 +++
|
||||
arch/x86/kernel/cpu/amd.c | 6 ++++++
|
||||
5 files changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/Kconfig.cpu
|
||||
+++ b/arch/x86/Kconfig.cpu
|
||||
@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT
|
||||
This lets you choose what x86 vendor support code your kernel
|
||||
will include.
|
||||
|
||||
+config BROADCAST_TLB_FLUSH
|
||||
+ def_bool y
|
||||
+ depends on CPU_SUP_AMD && 64BIT
|
||||
+
|
||||
config CPU_SUP_INTEL
|
||||
default y
|
||||
bool "Support Intel processors" if PROCESSOR_SELECT
|
||||
--- a/arch/x86/include/asm/cpufeatures.h
|
||||
+++ b/arch/x86/include/asm/cpufeatures.h
|
||||
@@ -338,6 +338,7 @@
|
||||
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
|
||||
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
|
||||
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
|
||||
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
|
||||
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
|
||||
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
|
||||
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
|
||||
--- a/arch/x86/include/asm/disabled-features.h
|
||||
+++ b/arch/x86/include/asm/disabled-features.h
|
||||
@@ -129,6 +129,12 @@
|
||||
#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+#define DISABLE_INVLPGB 0
|
||||
+#else
|
||||
+#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31))
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Make sure to add features to the correct mask
|
||||
*/
|
||||
@@ -146,7 +152,7 @@
|
||||
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
|
||||
DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
|
||||
#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM)
|
||||
-#define DISABLED_MASK13 0
|
||||
+#define DISABLED_MASK13 (DISABLE_INVLPGB)
|
||||
#define DISABLED_MASK14 0
|
||||
#define DISABLED_MASK15 0
|
||||
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void)
|
||||
extern unsigned long mmu_cr4_features;
|
||||
extern u32 *trampoline_cr4_features;
|
||||
|
||||
+/* How many pages can be invalidated with one INVLPGB. */
|
||||
+extern u16 invlpgb_count_max;
|
||||
+
|
||||
extern void initialize_tlbstate_and_flush(void);
|
||||
|
||||
/*
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "cpu.h"
|
||||
|
||||
+u16 invlpgb_count_max __ro_after_init;
|
||||
+
|
||||
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
|
||||
{
|
||||
u32 gprs[8] = { 0 };
|
||||
@@ -1145,6 +1147,10 @@ static void cpu_detect_tlb_amd(struct cp
|
||||
tlb_lli_2m[ENTRIES] = eax & mask;
|
||||
|
||||
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
|
||||
+
|
||||
+ /* Max number of pages INVLPGB can invalidate in one shot */
|
||||
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
|
||||
}
|
||||
|
||||
static const struct cpu_dev amd_cpu_dev = {
|
170
debian/patches/patchset-pf/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch
vendored
Normal file
170
debian/patches/patchset-pf/invlpgb/0005-x86-mm-Add-INVLPGB-support-code.patch
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
From 27bab4a6ed6ee7b7b0e2d216b8802800ef26b2ad Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Fri, 28 Feb 2025 20:32:30 +0100
|
||||
Subject: x86/mm: Add INVLPGB support code
|
||||
|
||||
Add helper functions and definitions needed to use broadcast TLB
|
||||
invalidation on AMD CPUs.
|
||||
|
||||
[ bp:
|
||||
- Cleanup commit message
|
||||
- Improve and expand comments
|
||||
- push the preemption guards inside the invlpgb* helpers
|
||||
- merge improvements from dhansen
|
||||
- add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
|
||||
can't do DCE properly yet and looks at the inline asm and
|
||||
complains about it getting a u64 argument on 32-bit code ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 132 insertions(+)
|
||||
|
||||
--- a/arch/x86/include/asm/tlb.h
|
||||
+++ b/arch/x86/include/asm/tlb.h
|
||||
@@ -6,6 +6,9 @@
|
||||
static inline void tlb_flush(struct mmu_gather *tlb);
|
||||
|
||||
#include <asm-generic/tlb.h>
|
||||
+#include <linux/kernel.h>
|
||||
+#include <vdso/bits.h>
|
||||
+#include <vdso/page.h>
|
||||
|
||||
static inline void tlb_flush(struct mmu_gather *tlb)
|
||||
{
|
||||
@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
}
|
||||
|
||||
+enum addr_stride {
|
||||
+ PTE_STRIDE = 0,
|
||||
+ PMD_STRIDE = 1
|
||||
+};
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+/*
|
||||
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
|
||||
+ *
|
||||
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
|
||||
+ * be done in a parallel fashion.
|
||||
+ *
|
||||
+ * The instruction takes the number of extra pages to invalidate, beyond
|
||||
+ * the first page, while __invlpgb gets the more human readable number of
|
||||
+ * pages to invalidate.
|
||||
+ *
|
||||
+ * The bits in rax[0:2] determine respectively which components of the address
|
||||
+ * (VA, PCID, ASID) get compared when flushing. If none of the bits are set, *any*
|
||||
+ * address in the specified range matches.
|
||||
+ *
|
||||
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
|
||||
+ * this CPU have completed.
|
||||
+ */
|
||||
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
|
||||
+ unsigned long addr, u16 nr_pages,
|
||||
+ enum addr_stride stride, u8 flags)
|
||||
+{
|
||||
+ u32 edx = (pcid << 16) | asid;
|
||||
+ u32 ecx = (stride << 31) | (nr_pages - 1);
|
||||
+ u64 rax = addr | flags;
|
||||
+
|
||||
+ /* The low bits in rax are for flags. Verify addr is clean. */
|
||||
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
|
||||
+
|
||||
+ /* INVLPGB; supported in binutils >= 2.36. */
|
||||
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
|
||||
+}
|
||||
+
|
||||
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
|
||||
+{
|
||||
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
|
||||
+}
|
||||
+
|
||||
+static inline void __tlbsync(void)
|
||||
+{
|
||||
+ /*
|
||||
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
|
||||
+ * to have completed. Print a warning if the task has been migrated,
|
||||
+ * and might not be waiting on all the INVLPGBs issued during this TLB
|
||||
+ * invalidation sequence.
|
||||
+ */
|
||||
+ cant_migrate();
|
||||
+
|
||||
+ /* TLBSYNC: supported in binutils >= 2.36. */
|
||||
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
|
||||
+}
|
||||
+#else
|
||||
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
|
||||
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
|
||||
+ unsigned long addr, u16 nr_pages,
|
||||
+ enum addr_stride s, u8 flags) { }
|
||||
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
|
||||
+static inline void __tlbsync(void) { }
|
||||
+#endif
|
||||
+
|
||||
+/*
|
||||
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
+ * of the three. For example:
|
||||
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
+ *
|
||||
+ * The first is used to invalidate (kernel) mappings at a particular
|
||||
+ * address across all processes.
|
||||
+ *
|
||||
+ * The latter invalidates all TLB entries matching a PCID.
|
||||
+ */
|
||||
+#define INVLPGB_FLAG_VA BIT(0)
|
||||
+#define INVLPGB_FLAG_PCID BIT(1)
|
||||
+#define INVLPGB_FLAG_ASID BIT(2)
|
||||
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
+
|
||||
+/* The implied mode when all bits are clear: */
|
||||
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
+
|
||||
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
|
||||
+ unsigned long addr,
|
||||
+ u16 nr, bool stride)
|
||||
+{
|
||||
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
|
||||
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
|
||||
+
|
||||
+ __invlpgb(0, pcid, addr, nr, str, flags);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for a given PCID, not including globals. */
|
||||
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
|
||||
+{
|
||||
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_all(void)
|
||||
+{
|
||||
+ /*
|
||||
+ * TLBSYNC at the end needs to make sure all flushes done on the
|
||||
+ * current CPU have been executed system-wide. Therefore, make
|
||||
+ * sure nothing gets migrated in-between but disable preemption
|
||||
+ * as it is cheaper.
|
||||
+ */
|
||||
+ guard(preempt)();
|
||||
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
|
||||
+ __tlbsync();
|
||||
+}
|
||||
+
|
||||
+/* Flush addr, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
|
||||
+{
|
||||
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for all PCIDs except globals. */
|
||||
+static inline void invlpgb_flush_all_nonglobals(void)
|
||||
+{
|
||||
+ guard(preempt)();
|
||||
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
|
||||
+ __tlbsync();
|
||||
+}
|
||||
#endif /* _ASM_X86_TLB_H */
|
97
debian/patches/patchset-pf/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch
vendored
Normal file
97
debian/patches/patchset-pf/invlpgb/0006-x86-mm-Use-INVLPGB-for-kernel-TLB-flushes.patch
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
From 358d71638f420efe8f7e05ce74aefe13e9320283 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:39 -0500
|
||||
Subject: x86/mm: Use INVLPGB for kernel TLB flushes
|
||||
|
||||
Use broadcast TLB invalidation for kernel addresses when available.
|
||||
Remove the need to send IPIs for kernel TLB flushes.
|
||||
|
||||
[ bp: Integrate dhansen's comments additions, merge the
|
||||
flush_tlb_all() change into this one too. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 44 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1083,7 +1083,6 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
|
||||
}
|
||||
|
||||
-
|
||||
static void do_flush_tlb_all(void *info)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
||||
@@ -1093,7 +1092,32 @@ static void do_flush_tlb_all(void *info)
|
||||
void flush_tlb_all(void)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+
|
||||
+ /* First try (faster) hardware-assisted TLB invalidation. */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_flush_all();
|
||||
+ else
|
||||
+ /* Fall back to the IPI-based invalidation. */
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+}
|
||||
+
|
||||
+/* Flush an arbitrarily large range of memory with INVLPGB. */
|
||||
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ unsigned long addr, nr;
|
||||
+
|
||||
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
|
||||
+ nr = (info->end - addr) >> PAGE_SHIFT;
|
||||
+
|
||||
+ /*
|
||||
+ * INVLPGB has a limit on the size of ranges it can
|
||||
+ * flush. Break up large flushes.
|
||||
+ */
|
||||
+ nr = clamp_val(nr, 1, invlpgb_count_max);
|
||||
+
|
||||
+ invlpgb_flush_addr_nosync(addr, nr);
|
||||
+ }
|
||||
+ __tlbsync();
|
||||
}
|
||||
|
||||
static void do_kernel_range_flush(void *info)
|
||||
@@ -1106,6 +1130,22 @@ static void do_kernel_range_flush(void *
|
||||
flush_tlb_one_kernel(addr);
|
||||
}
|
||||
|
||||
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_flush_all();
|
||||
+ else
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+}
|
||||
+
|
||||
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_kernel_range_flush(info);
|
||||
+ else
|
||||
+ on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
+}
|
||||
+
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
@@ -1116,9 +1156,9 @@ void flush_tlb_kernel_range(unsigned lon
|
||||
TLB_GENERATION_INVALID);
|
||||
|
||||
if (info->end == TLB_FLUSH_ALL)
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+ kernel_tlb_flush_all(info);
|
||||
else
|
||||
- on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
+ kernel_tlb_flush_range(info);
|
||||
|
||||
put_flush_tlb_info();
|
||||
}
|
32
debian/patches/patchset-pf/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch
vendored
Normal file
32
debian/patches/patchset-pf/invlpgb/0007-x86-mm-Use-broadcast-TLB-flushing-in-page-reclaim.patch
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
From 7cf099de79e12d6c4949f733c8cbb241bb08f07a Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:41 -0500
|
||||
Subject: x86/mm: Use broadcast TLB flushing in page reclaim
|
||||
|
||||
Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather
|
||||
than all the individual mappings that may be getting invalidated.
|
||||
|
||||
Use broadcast TLB flushing when that is available.
|
||||
|
||||
[ bp: Massage commit message. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1339,7 +1339,9 @@ void arch_tlbbatch_flush(struct arch_tlb
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
flush_tlb_multi(&batch->cpumask, info);
|
||||
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
||||
lockdep_assert_irqs_enabled();
|
286
debian/patches/patchset-pf/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch
vendored
Normal file
286
debian/patches/patchset-pf/invlpgb/0008-x86-mm-Add-global-ASID-allocation-helper-functions.patch
vendored
Normal file
@@ -0,0 +1,286 @@
|
||||
From f9ecaaca7ac26789d7d3e0d8022b7c99599dc8a3 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:42 -0500
|
||||
Subject: x86/mm: Add global ASID allocation helper functions
|
||||
|
||||
Add functions to manage global ASID space. Multithreaded processes that are
|
||||
simultaneously active on 4 or more CPUs can get a global ASID, resulting in the
|
||||
same PCID being used for that process on every CPU.
|
||||
|
||||
This in turn will allow the kernel to use hardware-assisted TLB flushing
|
||||
through AMD INVLPGB or Intel RAR for these processes.
|
||||
|
||||
[ bp:
|
||||
- Extend use_global_asid() comment
|
||||
- s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g
|
||||
- other touchups ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/mmu.h | 12 +++
|
||||
arch/x86/include/asm/mmu_context.h | 2 +
|
||||
arch/x86/include/asm/tlbflush.h | 37 +++++++
|
||||
arch/x86/mm/tlb.c | 154 ++++++++++++++++++++++++++++-
|
||||
4 files changed, 202 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/mmu.h
|
||||
+++ b/arch/x86/include/asm/mmu.h
|
||||
@@ -69,6 +69,18 @@ typedef struct {
|
||||
u16 pkey_allocation_map;
|
||||
s16 execute_only_pkey;
|
||||
#endif
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+ /*
|
||||
+ * The global ASID will be a non-zero value when the process has
|
||||
+ * the same ASID across all CPUs, allowing it to make use of
|
||||
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
|
||||
+ */
|
||||
+ u16 global_asid;
|
||||
+
|
||||
+ /* The process is transitioning to a new global ASID number. */
|
||||
+ bool asid_transition;
|
||||
+#endif
|
||||
} mm_context_t;
|
||||
|
||||
#define INIT_MM_CONTEXT(mm) \
|
||||
--- a/arch/x86/include/asm/mmu_context.h
|
||||
+++ b/arch/x86/include/asm/mmu_context.h
|
||||
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
+extern void mm_free_global_asid(struct mm_struct *mm);
|
||||
+
|
||||
/*
|
||||
* Init a new mm. Used on mm copies, like at fork()
|
||||
* and on mm's that are brand-new, like at execve().
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
+#include <asm/barrier.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/special_insns.h>
|
||||
@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long
|
||||
void flush_tlb_multi(const struct cpumask *cpumask,
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
+static inline bool is_dyn_asid(u16 asid)
|
||||
+{
|
||||
+ return asid < TLB_NR_DYN_ASIDS;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+static inline u16 mm_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return 0;
|
||||
+
|
||||
+ asid = smp_load_acquire(&mm->context.global_asid);
|
||||
+
|
||||
+ /* mm->context.global_asid is either 0, or a global ASID */
|
||||
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
|
||||
+
|
||||
+ return asid;
|
||||
+}
|
||||
+
|
||||
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
|
||||
+{
|
||||
+ /*
|
||||
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
|
||||
+ * finish_asid_transition() needs to observe asid_transition = true
|
||||
+ * once it observes global_asid.
|
||||
+ */
|
||||
+ mm->context.asid_transition = true;
|
||||
+ smp_store_release(&mm->context.global_asid, asid);
|
||||
+}
|
||||
+#else
|
||||
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
+
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
#include <asm/paravirt.h>
|
||||
#endif
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -74,13 +74,15 @@
|
||||
* use different names for each of them:
|
||||
*
|
||||
* ASID - [0, TLB_NR_DYN_ASIDS-1]
|
||||
- * the canonical identifier for an mm
|
||||
+ * the canonical identifier for an mm, dynamically allocated on each CPU
|
||||
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
|
||||
+ * the canonical, global identifier for an mm, identical across all CPUs
|
||||
*
|
||||
- * kPCID - [1, TLB_NR_DYN_ASIDS]
|
||||
+ * kPCID - [1, MAX_ASID_AVAILABLE]
|
||||
* the value we write into the PCID part of CR3; corresponds to the
|
||||
* ASID+1, because PCID 0 is special.
|
||||
*
|
||||
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
|
||||
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
|
||||
* for KPTI each mm has two address spaces and thus needs two
|
||||
* PCID values, but we can still do with a single ASID denomination
|
||||
* for each mm. Corresponds to kPCID + 2048.
|
||||
@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st
|
||||
}
|
||||
|
||||
/*
|
||||
+ * Global ASIDs are allocated for multi-threaded processes that are
|
||||
+ * active on multiple CPUs simultaneously, giving each of those
|
||||
+ * processes the same PCID on every CPU, for use with hardware-assisted
|
||||
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
|
||||
+ *
|
||||
+ * These global ASIDs are held for the lifetime of the process.
|
||||
+ */
|
||||
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
|
||||
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
|
||||
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
|
||||
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
|
||||
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
|
||||
+
|
||||
+/*
|
||||
+ * When the search for a free ASID in the global ASID space reaches
|
||||
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
|
||||
+ * freed global ASIDs are safe to re-use.
|
||||
+ *
|
||||
+ * This way the global flush only needs to happen at ASID rollover
|
||||
+ * time, and not at ASID allocation time.
|
||||
+ */
|
||||
+static void reset_global_asid_space(void)
|
||||
+{
|
||||
+ lockdep_assert_held(&global_asid_lock);
|
||||
+
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+
|
||||
+ /*
|
||||
+ * The TLB flush above makes it safe to re-use the previously
|
||||
+ * freed global ASIDs.
|
||||
+ */
|
||||
+ bitmap_andnot(global_asid_used, global_asid_used,
|
||||
+ global_asid_freed, MAX_ASID_AVAILABLE);
|
||||
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
|
||||
+
|
||||
+ /* Restart the search from the start of global ASID space. */
|
||||
+ last_global_asid = TLB_NR_DYN_ASIDS;
|
||||
+}
|
||||
+
|
||||
+static u16 allocate_global_asid(void)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ lockdep_assert_held(&global_asid_lock);
|
||||
+
|
||||
+ /* The previous allocation hit the edge of available address space */
|
||||
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
|
||||
+ reset_global_asid_space();
|
||||
+
|
||||
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
|
||||
+
|
||||
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
|
||||
+ /* This should never happen. */
|
||||
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
|
||||
+ global_asid_available);
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ /* Claim this global ASID. */
|
||||
+ __set_bit(asid, global_asid_used);
|
||||
+ last_global_asid = asid;
|
||||
+ global_asid_available--;
|
||||
+ return asid;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Check whether a process is currently active on more than @threshold CPUs.
|
||||
+ * This is a cheap estimation on whether or not it may make sense to assign
|
||||
+ * a global ASID to this process, and use broadcast TLB invalidation.
|
||||
+ */
|
||||
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
|
||||
+{
|
||||
+ int count = 0;
|
||||
+ int cpu;
|
||||
+
|
||||
+ /* This quick check should eliminate most single threaded programs. */
|
||||
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
|
||||
+ return false;
|
||||
+
|
||||
+ /* Slower check to make sure. */
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ /* Skip the CPUs that aren't really running this process. */
|
||||
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
|
||||
+ continue;
|
||||
+
|
||||
+ if (++count > threshold)
|
||||
+ return true;
|
||||
+ }
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Assign a global ASID to the current process, protecting against
|
||||
+ * races between multiple threads in the process.
|
||||
+ */
|
||||
+static void use_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
|
||||
+
|
||||
+ /* This process is already using broadcast TLB invalidation. */
|
||||
+ if (mm_global_asid(mm))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * The last global ASID was consumed while waiting for the lock.
|
||||
+ *
|
||||
+ * If this fires, a more aggressive ASID reuse scheme might be
|
||||
+ * needed.
|
||||
+ */
|
||||
+ if (!global_asid_available) {
|
||||
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ asid = allocate_global_asid();
|
||||
+ if (!asid)
|
||||
+ return;
|
||||
+
|
||||
+ mm_assign_global_asid(mm, asid);
|
||||
+}
|
||||
+
|
||||
+void mm_free_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return;
|
||||
+
|
||||
+ if (!mm_global_asid(mm))
|
||||
+ return;
|
||||
+
|
||||
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
|
||||
+
|
||||
+ /* The global ASID can be re-used only after flush at wrap-around. */
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+ __set_bit(mm->context.global_asid, global_asid_freed);
|
||||
+
|
||||
+ mm->context.global_asid = 0;
|
||||
+ global_asid_available++;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
219
debian/patches/patchset-pf/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch
vendored
Normal file
219
debian/patches/patchset-pf/invlpgb/0009-x86-mm-Handle-global-ASID-context-switch-and-TLB-flu.patch
vendored
Normal file
@@ -0,0 +1,219 @@
|
||||
From b56070b9f121507cabe352e03f0c534db2d5adc7 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:43 -0500
|
||||
Subject: x86/mm: Handle global ASID context switch and TLB flush
|
||||
|
||||
Do context switch and TLB flush support for processes that use a global
|
||||
ASID and PCID across all CPUs.
|
||||
|
||||
At both context switch time and TLB flush time, it needs to be checked whether
|
||||
a task is switching to a global ASID, and, if so, reload the TLB with the new
|
||||
ASID as appropriate.
|
||||
|
||||
In both code paths, the TLB flush is avoided if a global ASID is used, because
|
||||
the global ASIDs are always kept up to date across CPUs, even when the
|
||||
process is not running on a CPU.
|
||||
|
||||
[ bp:
|
||||
- Massage
|
||||
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
|
||||
]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlbflush.h | 14 ++++++
|
||||
arch/x86/mm/tlb.c | 77 ++++++++++++++++++++++++++++++---
|
||||
2 files changed, 84 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid)
|
||||
return asid < TLB_NR_DYN_ASIDS;
|
||||
}
|
||||
|
||||
+static inline bool is_global_asid(u16 asid)
|
||||
+{
|
||||
+ return !is_dyn_asid(asid);
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm)
|
||||
{
|
||||
@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid
|
||||
mm->context.asid_transition = true;
|
||||
smp_store_release(&mm->context.global_asid, asid);
|
||||
}
|
||||
+
|
||||
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return false;
|
||||
+
|
||||
+ return mm && READ_ONCE(mm->context.asid_transition);
|
||||
+}
|
||||
#else
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st
|
||||
return;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * TLB consistency for global ASIDs is maintained with hardware assisted
|
||||
+ * remote TLB flushing. Global ASIDs are always up to date.
|
||||
+ */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ u16 global_asid = mm_global_asid(next);
|
||||
+
|
||||
+ if (global_asid) {
|
||||
+ *new_asid = global_asid;
|
||||
+ *need_flush = false;
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (this_cpu_read(cpu_tlbstate.invalidate_other))
|
||||
clear_asid_other();
|
||||
|
||||
@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc
|
||||
}
|
||||
|
||||
/*
|
||||
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
|
||||
+ */
|
||||
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
|
||||
+{
|
||||
+ u16 global_asid = mm_global_asid(mm);
|
||||
+
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return false;
|
||||
+
|
||||
+ /* Process is transitioning to a global ASID */
|
||||
+ if (global_asid && asid != global_asid)
|
||||
+ return true;
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
||||
@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
*/
|
||||
if (prev == next) {
|
||||
/* Not actually switching mm's */
|
||||
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
|
||||
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
next->context.ctx_id);
|
||||
|
||||
/*
|
||||
@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
+ /* Check if the current mm is transitioning to a global ASID */
|
||||
+ if (mm_needs_global_asid(next, prev_asid)) {
|
||||
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
+ goto reload_tlb;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Broadcast TLB invalidation keeps this ASID up to date
|
||||
+ * all the time.
|
||||
+ */
|
||||
+ if (is_global_asid(prev_asid))
|
||||
+ return;
|
||||
+
|
||||
/*
|
||||
* If the CPU is not in lazy TLB mode, we are just switching
|
||||
* from one thread in a process to another thread in the same
|
||||
@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
cond_mitigation(tsk);
|
||||
|
||||
/*
|
||||
+ * Let nmi_uaccess_okay() and finish_asid_transition()
|
||||
+ * know that CR3 is changing.
|
||||
+ */
|
||||
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
+ barrier();
|
||||
+
|
||||
+ /*
|
||||
* Leave this CPU in prev's mm_cpumask. Atomic writes to
|
||||
* mm_cpumask can be expensive under contention. The CPU
|
||||
* will be removed lazily at TLB flush time.
|
||||
@@ -768,18 +821,12 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
|
||||
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
-
|
||||
- /*
|
||||
- * Indicate that CR3 is about to change. nmi_uaccess_okay()
|
||||
- * and others are sensitive to the window where mm_cpumask(),
|
||||
- * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
|
||||
- */
|
||||
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
- barrier();
|
||||
}
|
||||
|
||||
+reload_tlb:
|
||||
new_lam = mm_lam_cr3_mask(next);
|
||||
if (need_flush) {
|
||||
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
|
||||
@@ -898,7 +945,7 @@ static void flush_tlb_func(void *info)
|
||||
const struct flush_tlb_info *f = info;
|
||||
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+ u64 local_tlb_gen;
|
||||
bool local = smp_processor_id() == f->initiating_cpu;
|
||||
unsigned long nr_invalidate = 0;
|
||||
u64 mm_tlb_gen;
|
||||
@@ -921,6 +968,16 @@ static void flush_tlb_func(void *info)
|
||||
if (unlikely(loaded_mm == &init_mm))
|
||||
return;
|
||||
|
||||
+ /* Reload the ASID if transitioning into or out of a global ASID */
|
||||
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
|
||||
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
|
||||
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
+ }
|
||||
+
|
||||
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
|
||||
+ if (is_global_asid(loaded_mm_asid))
|
||||
+ return;
|
||||
+
|
||||
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
||||
loaded_mm->context.ctx_id);
|
||||
|
||||
@@ -938,6 +995,8 @@ static void flush_tlb_func(void *info)
|
||||
return;
|
||||
}
|
||||
|
||||
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+
|
||||
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
|
||||
f->new_tlb_gen <= local_tlb_gen)) {
|
||||
/*
|
||||
@@ -1120,7 +1179,7 @@ STATIC_NOPV void native_flush_tlb_multi(
|
||||
* up on the new contents of what used to be page tables, while
|
||||
* doing a speculative memory access.
|
||||
*/
|
||||
- if (info->freed_tables)
|
||||
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
|
||||
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
|
||||
else
|
||||
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
|
88
debian/patches/patchset-pf/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch
vendored
Normal file
88
debian/patches/patchset-pf/invlpgb/0010-x86-mm-Add-global-ASID-process-exit-helpers.patch
vendored
Normal file
@@ -0,0 +1,88 @@
|
||||
From 6d3b8545e2c3c638363fb449a99b5a6cbab87a49 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:44 -0500
|
||||
Subject: x86/mm: Add global ASID process exit helpers
|
||||
|
||||
A global ASID is allocated for the lifetime of a process. Free the global ASID
|
||||
at process exit time.
|
||||
|
||||
[ bp: Massage, create helpers, hide details inside them. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/mmu_context.h | 8 +++++++-
|
||||
arch/x86/include/asm/tlbflush.h | 9 +++++++++
|
||||
2 files changed, 16 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/mmu_context.h
|
||||
+++ b/arch/x86/include/asm/mmu_context.h
|
||||
@@ -2,7 +2,6 @@
|
||||
#ifndef _ASM_X86_MMU_CONTEXT_H
|
||||
#define _ASM_X86_MMU_CONTEXT_H
|
||||
|
||||
-#include <asm/desc.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/pkeys.h>
|
||||
@@ -13,6 +12,7 @@
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/debugreg.h>
|
||||
#include <asm/gsseg.h>
|
||||
+#include <asm/desc.h>
|
||||
|
||||
extern atomic64_t last_mm_ctx_id;
|
||||
|
||||
@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
+#define mm_init_global_asid mm_init_global_asid
|
||||
+extern void mm_init_global_asid(struct mm_struct *mm);
|
||||
+
|
||||
extern void mm_free_global_asid(struct mm_struct *mm);
|
||||
|
||||
/*
|
||||
@@ -163,6 +166,8 @@ static inline int init_new_context(struc
|
||||
mm->context.execute_only_pkey = -1;
|
||||
}
|
||||
#endif
|
||||
+
|
||||
+ mm_init_global_asid(mm);
|
||||
mm_reset_untag_mask(mm);
|
||||
init_new_context_ldt(mm);
|
||||
return 0;
|
||||
@@ -172,6 +177,7 @@ static inline int init_new_context(struc
|
||||
static inline void destroy_context(struct mm_struct *mm)
|
||||
{
|
||||
destroy_context_ldt(mm);
|
||||
+ mm_free_global_asid(mm);
|
||||
}
|
||||
|
||||
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct
|
||||
return asid;
|
||||
}
|
||||
|
||||
+static inline void mm_init_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ mm->context.global_asid = 0;
|
||||
+ mm->context.asid_transition = false;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
|
||||
{
|
||||
/*
|
||||
@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
|
||||
}
|
||||
#else
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
219
debian/patches/patchset-pf/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
vendored
Normal file
219
debian/patches/patchset-pf/invlpgb/0011-x86-mm-Enable-broadcast-TLB-invalidation-for-multi-t.patch
vendored
Normal file
@@ -0,0 +1,219 @@
|
||||
From 077e9ceb65f514ea63afc65cce86ce8677e77012 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:45 -0500
|
||||
Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
|
||||
processes
|
||||
|
||||
There is not enough room in the 12-bit ASID address space to hand out
|
||||
broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
|
||||
when they are observed to be simultaneously running on 4 or more CPUs.
|
||||
|
||||
This also allows single-threaded processes to continue using the cheaper, local
|
||||
TLB invalidation instructions like INVLPG.
|
||||
|
||||
Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
|
||||
a generically named broadcast_tlb_flush() function which can later also be
|
||||
used for Intel RAR.
|
||||
|
||||
Combined with the removal of unnecessary lru_add_drain() calls (see
|
||||
https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
|
||||
a nice performance boost for the will-it-scale tlb_flush2_threads test on an
|
||||
AMD Milan system with 36 cores:
|
||||
|
||||
- vanilla kernel: 527k loops/second
|
||||
- lru_add_drain removal: 731k loops/second
|
||||
- only INVLPGB: 527k loops/second
|
||||
- lru_add_drain + INVLPGB: 1157k loops/second
|
||||
|
||||
Profiling with only the INVLPGB changes showed while TLB invalidation went
|
||||
down from 40% of the total CPU time to only around 4% of CPU time, the
|
||||
contention simply moved to the LRU lock.
|
||||
|
||||
Fixing both at the same time about doubles the number of iterations per second
|
||||
in this case.
|
||||
|
||||
Comparing will-it-scale tlb_flush2_threads with several different numbers of
|
||||
threads on a 72 CPU AMD Milan shows similar results. The number represents the
|
||||
total number of loops per second across all the threads:
|
||||
|
||||
threads tip INVLPGB
|
||||
|
||||
1 315k 304k
|
||||
2 423k 424k
|
||||
4 644k 1032k
|
||||
8 652k 1267k
|
||||
16 737k 1368k
|
||||
32 759k 1199k
|
||||
64 636k 1094k
|
||||
72 609k 993k
|
||||
|
||||
1 and 2 thread performance is similar with and without INVLPGB, because
|
||||
INVLPGB is only used on processes using 4 or more CPUs simultaneously.
|
||||
|
||||
The number is the median across 5 runs.
|
||||
|
||||
Some numbers closer to real world performance can be found at Phoronix, thanks
|
||||
to Michael:
|
||||
|
||||
https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits
|
||||
|
||||
[ bp:
|
||||
- Massage
|
||||
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
|
||||
- :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
|
||||
- Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com
|
||||
]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlbflush.h | 6 ++
|
||||
arch/x86/mm/tlb.c | 104 +++++++++++++++++++++++++++++++-
|
||||
2 files changed, 109 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid
|
||||
smp_store_release(&mm->context.global_asid, asid);
|
||||
}
|
||||
|
||||
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
|
||||
+{
|
||||
+ WRITE_ONCE(mm->context.asid_transition, false);
|
||||
+}
|
||||
+
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm)
|
||||
{
|
||||
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
static inline void mm_init_global_asid(struct mm_struct *mm) { }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct
|
||||
}
|
||||
|
||||
/*
|
||||
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
|
||||
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
|
||||
+ * global ASIDs are handed out to processes that have frequent TLB
|
||||
+ * flushes and are active on 4 or more CPUs simultaneously.
|
||||
+ */
|
||||
+static void consider_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return;
|
||||
+
|
||||
+ /* Check every once in a while. */
|
||||
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Assign a global ASID if the process is active on
|
||||
+ * 4 or more CPUs simultaneously.
|
||||
+ */
|
||||
+ if (mm_active_cpus_exceeds(mm, 3))
|
||||
+ use_global_asid(mm);
|
||||
+}
|
||||
+
|
||||
+static void finish_asid_transition(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ struct mm_struct *mm = info->mm;
|
||||
+ int bc_asid = mm_global_asid(mm);
|
||||
+ int cpu;
|
||||
+
|
||||
+ if (!mm_in_asid_transition(mm))
|
||||
+ return;
|
||||
+
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ /*
|
||||
+ * The remote CPU is context switching. Wait for that to
|
||||
+ * finish, to catch the unlikely case of it switching to
|
||||
+ * the target mm with an out of date ASID.
|
||||
+ */
|
||||
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
|
||||
+ cpu_relax();
|
||||
+
|
||||
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ /*
|
||||
+ * If at least one CPU is not using the global ASID yet,
|
||||
+ * send a TLB flush IPI. The IPI should cause stragglers
|
||||
+ * to transition soon.
|
||||
+ *
|
||||
+ * This can race with the CPU switching to another task;
|
||||
+ * that results in a (harmless) extra IPI.
|
||||
+ */
|
||||
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
|
||||
+ flush_tlb_multi(mm_cpumask(info->mm), info);
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* All the CPUs running this process are using the global ASID. */
|
||||
+ mm_clear_asid_transition(mm);
|
||||
+}
|
||||
+
|
||||
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ bool pmd = info->stride_shift == PMD_SHIFT;
|
||||
+ unsigned long asid = mm_global_asid(info->mm);
|
||||
+ unsigned long addr = info->start;
|
||||
+
|
||||
+ /*
|
||||
+ * TLB flushes with INVLPGB are kicked off asynchronously.
|
||||
+ * The inc_mm_tlb_gen() guarantees page table updates are done
|
||||
+ * before these TLB flushes happen.
|
||||
+ */
|
||||
+ if (info->end == TLB_FLUSH_ALL) {
|
||||
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
|
||||
+ /* Do any CPUs supporting INVLPGB need PTI? */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
|
||||
+ } else do {
|
||||
+ unsigned long nr = 1;
|
||||
+
|
||||
+ if (info->stride_shift <= PMD_SHIFT) {
|
||||
+ nr = (info->end - addr) >> info->stride_shift;
|
||||
+ nr = clamp_val(nr, 1, invlpgb_count_max);
|
||||
+ }
|
||||
+
|
||||
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
|
||||
+
|
||||
+ addr += nr << info->stride_shift;
|
||||
+ } while (addr < info->end);
|
||||
+
|
||||
+ finish_asid_transition(info);
|
||||
+
|
||||
+ /* Wait for the INVLPGBs kicked off above to finish. */
|
||||
+ __tlbsync();
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
||||
@@ -1275,9 +1374,12 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
+ if (mm_global_asid(mm)) {
|
||||
+ broadcast_tlb_flush(info);
|
||||
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
info->trim_cpumask = should_trim_cpumask(mm);
|
||||
flush_tlb_multi(mm_cpumask(mm), info);
|
||||
+ consider_global_asid(mm);
|
||||
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
||||
lockdep_assert_irqs_enabled();
|
||||
local_irq_disable();
|
83
debian/patches/patchset-pf/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch
vendored
Normal file
83
debian/patches/patchset-pf/invlpgb/0012-x86-mm-Enable-AMD-translation-cache-extensions.patch
vendored
Normal file
@@ -0,0 +1,83 @@
|
||||
From 1994cff363a37aff5b1232ca9f757b02ae244956 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:47 -0500
|
||||
Subject: x86/mm: Enable AMD translation cache extensions
|
||||
|
||||
With AMD TCE (translation cache extensions) only the intermediate mappings
|
||||
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
|
||||
rather than all intermediate mappings getting zapped at every TLB invalidation.
|
||||
|
||||
This can help reduce the TLB miss rate, by keeping more intermediate mappings
|
||||
in the cache.
|
||||
|
||||
From the AMD manual:
|
||||
|
||||
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
|
||||
1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
|
||||
entries. When this bit is 0, these instructions remove the target PTE from the
|
||||
TLB as well as all upper-level table entries that are cached in the TLB,
|
||||
whether or not they are associated with the target PTE. When this bit is set,
|
||||
these instructions will remove the target PTE and only those upper-level
|
||||
entries that lead to the target PTE in the page table hierarchy, leaving
|
||||
unrelated upper-level entries intact.
|
||||
|
||||
[ bp: use cpu_has()... I know, it is a mess. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/msr-index.h | 2 ++
|
||||
arch/x86/kernel/cpu/amd.c | 4 ++++
|
||||
tools/arch/x86/include/asm/msr-index.h | 2 ++
|
||||
3 files changed, 8 insertions(+)
|
||||
|
||||
--- a/arch/x86/include/asm/msr-index.h
|
||||
+++ b/arch/x86/include/asm/msr-index.h
|
||||
@@ -25,6 +25,7 @@
|
||||
#define _EFER_SVME 12 /* Enable virtualization */
|
||||
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
|
||||
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
|
||||
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
|
||||
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
|
||||
|
||||
#define EFER_SCE (1<<_EFER_SCE)
|
||||
@@ -34,6 +35,7 @@
|
||||
#define EFER_SVME (1<<_EFER_SVME)
|
||||
#define EFER_LMSLE (1<<_EFER_LMSLE)
|
||||
#define EFER_FFXSR (1<<_EFER_FFXSR)
|
||||
+#define EFER_TCE (1<<_EFER_TCE)
|
||||
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
|
||||
|
||||
/*
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -1081,6 +1081,10 @@ static void init_amd(struct cpuinfo_x86
|
||||
|
||||
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
|
||||
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
|
||||
+
|
||||
+ /* Enable Translation Cache Extension */
|
||||
+ if (cpu_has(c, X86_FEATURE_TCE))
|
||||
+ msr_set_bit(MSR_EFER, _EFER_TCE);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
--- a/tools/arch/x86/include/asm/msr-index.h
|
||||
+++ b/tools/arch/x86/include/asm/msr-index.h
|
||||
@@ -25,6 +25,7 @@
|
||||
#define _EFER_SVME 12 /* Enable virtualization */
|
||||
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
|
||||
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
|
||||
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
|
||||
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
|
||||
|
||||
#define EFER_SCE (1<<_EFER_SCE)
|
||||
@@ -34,6 +35,7 @@
|
||||
#define EFER_SVME (1<<_EFER_SVME)
|
||||
#define EFER_LMSLE (1<<_EFER_LMSLE)
|
||||
#define EFER_FFXSR (1<<_EFER_FFXSR)
|
||||
+#define EFER_TCE (1<<_EFER_TCE)
|
||||
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
|
||||
|
||||
/*
|
121
debian/patches/patchset-pf/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
vendored
Normal file
121
debian/patches/patchset-pf/invlpgb/0013-x86-mm-Always-set-the-ASID-valid-bit-for-the-INVLPGB.patch
vendored
Normal file
@@ -0,0 +1,121 @@
|
||||
From 5932a2c8122050c4a2f71588778feb0677fe32b4 Mon Sep 17 00:00:00 2001
|
||||
From: Tom Lendacky <thomas.lendacky@amd.com>
|
||||
Date: Tue, 4 Mar 2025 12:59:56 +0100
|
||||
Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
|
||||
|
||||
When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
|
||||
the ASID valid bit is not set, the instruction will flush the TLB entries that
|
||||
match the specified criteria for any ASID, not just those of the host. If
|
||||
virtual machines are running on the system, this may result in inadvertent
|
||||
flushes of guest TLB entries.
|
||||
|
||||
When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
|
||||
not intercepted by the hypervisor, the hardware will replace the requested ASID
|
||||
with the guest ASID and set the ASID valid bit before doing the broadcast
|
||||
invalidation. Thus a guest is only able to flush its own TLB entries.
|
||||
|
||||
So to limit the host TLB flushing reach, always set the ASID valid bit using an
|
||||
ASID value of 0 (which represents the host/hypervisor). This will result in
|
||||
the desired effect in both host and guest.
|
||||
|
||||
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local
|
||||
---
|
||||
arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++-----------------
|
||||
1 file changed, 32 insertions(+), 26 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlb.h
|
||||
+++ b/arch/x86/include/asm/tlb.h
|
||||
@@ -33,6 +33,27 @@ enum addr_stride {
|
||||
PMD_STRIDE = 1
|
||||
};
|
||||
|
||||
+/*
|
||||
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
+ * of the three. For example:
|
||||
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
+ *
|
||||
+ * The first is used to invalidate (kernel) mappings at a particular
|
||||
+ * address across all processes.
|
||||
+ *
|
||||
+ * The latter invalidates all TLB entries matching a PCID.
|
||||
+ */
|
||||
+#define INVLPGB_FLAG_VA BIT(0)
|
||||
+#define INVLPGB_FLAG_PCID BIT(1)
|
||||
+#define INVLPGB_FLAG_ASID BIT(2)
|
||||
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
+
|
||||
+/* The implied mode when all bits are clear: */
|
||||
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
+
|
||||
#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
/*
|
||||
* INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
|
||||
@@ -40,14 +61,20 @@ enum addr_stride {
|
||||
* The INVLPGB instruction is weakly ordered, and a batch of invalidations can
|
||||
* be done in a parallel fashion.
|
||||
*
|
||||
- * The instruction takes the number of extra pages to invalidate, beyond
|
||||
- * the first page, while __invlpgb gets the more human readable number of
|
||||
- * pages to invalidate.
|
||||
+ * The instruction takes the number of extra pages to invalidate, beyond the
|
||||
+ * first page, while __invlpgb gets the more human readable number of pages to
|
||||
+ * invalidate.
|
||||
*
|
||||
* The bits in rax[0:2] determine respectively which components of the address
|
||||
* (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
|
||||
* address in the specified range matches.
|
||||
*
|
||||
+ * Since it is desired to only flush TLB entries for the ASID that is executing
|
||||
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
|
||||
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
|
||||
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
|
||||
+ * use the actual ASID value of the guest.
|
||||
+ *
|
||||
* TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
|
||||
* this CPU have completed.
|
||||
*/
|
||||
@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo
|
||||
unsigned long addr, u16 nr_pages,
|
||||
enum addr_stride stride, u8 flags)
|
||||
{
|
||||
- u32 edx = (pcid << 16) | asid;
|
||||
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
|
||||
u32 ecx = (stride << 31) | (nr_pages - 1);
|
||||
- u64 rax = addr | flags;
|
||||
+ u32 edx = (pcid << 16) | asid;
|
||||
|
||||
/* The low bits in rax are for flags. Verify addr is clean. */
|
||||
VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
|
||||
@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne
|
||||
static inline void __tlbsync(void) { }
|
||||
#endif
|
||||
|
||||
-/*
|
||||
- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
- * of the three. For example:
|
||||
- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
- * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
- *
|
||||
- * The first is used to invalidate (kernel) mappings at a particular
|
||||
- * address across all processes.
|
||||
- *
|
||||
- * The latter invalidates all TLB entries matching a PCID.
|
||||
- */
|
||||
-#define INVLPGB_FLAG_VA BIT(0)
|
||||
-#define INVLPGB_FLAG_PCID BIT(1)
|
||||
-#define INVLPGB_FLAG_ASID BIT(2)
|
||||
-#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
-#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
-#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
-
|
||||
-/* The implied mode when all bits are clear: */
|
||||
-#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
-
|
||||
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
|
||||
unsigned long addr,
|
||||
u16 nr, bool stride)
|
@@ -0,0 +1,70 @@
|
||||
From 0e0a5ca37a8e3b06f450f4093ba1b6d6f33c2161 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Wed, 19 Mar 2025 13:25:20 -0400
|
||||
Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped
|
||||
|
||||
Track whether pages were unmapped from any MM (even ones with a currently
|
||||
empty mm_cpumask) by the reclaim code, to figure out whether or not
|
||||
broadcast TLB flush should be done when reclaim finishes.
|
||||
|
||||
The reason any MM must be tracked, and not only ones contributing to the
|
||||
tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to
|
||||
date even on CPUs where the MM is not currently active.
|
||||
|
||||
This change allows reclaim to avoid doing TLB flushes when only clean page
|
||||
cache pages and/or slab memory were reclaimed, which is fairly common.
|
||||
|
||||
( This is a simpler alternative to the code that was in my INVLPGB series
|
||||
before, and it seems to capture most of the benefit due to how common
|
||||
it is to reclaim only page cache. )
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Andy Lutomirski <luto@kernel.org>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn
|
||||
---
|
||||
arch/x86/include/asm/tlbbatch.h | 5 +++++
|
||||
arch/x86/include/asm/tlbflush.h | 1 +
|
||||
arch/x86/mm/tlb.c | 3 ++-
|
||||
3 files changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbbatch.h
|
||||
+++ b/arch/x86/include/asm/tlbbatch.h
|
||||
@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
|
||||
* the PFNs being flushed..
|
||||
*/
|
||||
struct cpumask cpumask;
|
||||
+ /*
|
||||
+ * Set if pages were unmapped from any MM, even one that does not
|
||||
+ * have active CPUs in its cpumask.
|
||||
+ */
|
||||
+ bool unmapped_pages;
|
||||
};
|
||||
|
||||
#endif /* _ARCH_X86_TLBBATCH_H */
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen
|
||||
{
|
||||
inc_mm_tlb_gen(mm);
|
||||
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
|
||||
+ batch->unmapped_pages = true;
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
|
||||
}
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1648,8 +1648,9 @@ void arch_tlbbatch_flush(struct arch_tlb
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
|
||||
invlpgb_flush_all_nonglobals();
|
||||
+ batch->unmapped_pages = false;
|
||||
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
flush_tlb_multi(&batch->cpumask, info);
|
||||
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
@@ -0,0 +1,92 @@
|
||||
From 6ae491224973eb4013ee67a8c05c420f057d5fee Mon Sep 17 00:00:00 2001
|
||||
From: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Date: Thu, 8 May 2025 15:41:32 -0700
|
||||
Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently
|
||||
skipped
|
||||
|
||||
tl;dr: There is a window in the mm switching code where the new CR3 is
|
||||
set and the CPU should be getting TLB flushes for the new mm. But
|
||||
should_flush_tlb() has a bug and suppresses the flush. Fix it by
|
||||
widening the window where should_flush_tlb() sends an IPI.
|
||||
|
||||
Long Version:
|
||||
|
||||
=== History ===
|
||||
|
||||
There were a few things leading up to this.
|
||||
|
||||
First, updating mm_cpumask() was observed to be too expensive, so it was
|
||||
made lazier. But being lazy caused too many unnecessary IPIs to CPUs
|
||||
due to the now-lazy mm_cpumask(). So code was added to cull
|
||||
mm_cpumask() periodically[2]. But that culling was a bit too aggressive
|
||||
and skipped sending TLB flushes to CPUs that need them. So here we are
|
||||
again.
|
||||
|
||||
=== Problem ===
|
||||
|
||||
The too-aggressive code in should_flush_tlb() strikes in this window:
|
||||
|
||||
// Turn on IPIs for this CPU/mm combination, but only
|
||||
// if should_flush_tlb() agrees:
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
load_new_mm_cr3(need_flush);
|
||||
// ^ After 'need_flush' is set to false, IPIs *MUST*
|
||||
// be sent to this CPU and not be ignored.
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
// ^ Not until this point does should_flush_tlb()
|
||||
// become true!
|
||||
|
||||
should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3()
|
||||
and writing to 'loaded_mm', which is a window where they should not be
|
||||
suppressed. Whoops.
|
||||
|
||||
=== Solution ===
|
||||
|
||||
Thankfully, the fuzzy "just about to write CR3" window is already marked
|
||||
with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in
|
||||
should_flush_tlb() is sufficient to ensure that the CPU is targeted with
|
||||
an IPI.
|
||||
|
||||
This will cause more TLB flush IPIs. But the window is relatively small
|
||||
and I do not expect this to cause any kind of measurable performance
|
||||
impact.
|
||||
|
||||
Update the comment where LOADED_MM_SWITCHING is written since it grew
|
||||
yet another user.
|
||||
|
||||
Peter Z also raised a concern that should_flush_tlb() might not observe
|
||||
'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off()
|
||||
writes them. Add a barrier to ensure that they are observed in the
|
||||
order they are written.
|
||||
|
||||
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Acked-by: Rik van Riel <riel@surriel.com>
|
||||
Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1]
|
||||
Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2]
|
||||
Reported-by: Stephen Dolan <sdolan@janestreet.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Acked-by: Ingo Molnar <mingo@kernel.org>
|
||||
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
---
|
||||
arch/x86/mm/tlb.c | 22 +++++++++++++++++++---
|
||||
1 file changed, 19 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -900,8 +900,9 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
cond_mitigation(tsk);
|
||||
|
||||
/*
|
||||
- * Let nmi_uaccess_okay() and finish_asid_transition()
|
||||
- * know that CR3 is changing.
|
||||
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
|
||||
+ * and others are sensitive to the window where mm_cpumask(),
|
||||
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
barrier();
|
@@ -1,33 +0,0 @@
|
||||
From 7b3f0f8d11f1b4319f593ba02d4dece890755dfa Mon Sep 17 00:00:00 2001
|
||||
From: Namjae Jeon <linkinjeon@kernel.org>
|
||||
Date: Wed, 30 Apr 2025 11:18:28 +0900
|
||||
Subject: ksmbd: prevent rename with empty string
|
||||
|
||||
Client can send empty newname string to ksmbd server.
|
||||
It will cause a kernel oops from d_alloc.
|
||||
This patch returns an error when attempting to rename
|
||||
a file or directory with an empty new name string.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Reported-by: Norbert Szetei <norbert@doyensec.com>
|
||||
Tested-by: Norbert Szetei <norbert@doyensec.com>
|
||||
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/server/smb2pdu.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/fs/smb/server/smb2pdu.c
|
||||
+++ b/fs/smb/server/smb2pdu.c
|
||||
@@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int
|
||||
return name;
|
||||
}
|
||||
|
||||
+ if (*name == '\0') {
|
||||
+ kfree(name);
|
||||
+ return ERR_PTR(-EINVAL);
|
||||
+ }
|
||||
+
|
||||
if (*name == '\\') {
|
||||
pr_err("not allow directory name included leading slash\n");
|
||||
kfree(name);
|
35
debian/patches/patchset-pf/smb/0001-smb-client-fix-memory-leak-during-error-handling-for.patch
vendored
Normal file
35
debian/patches/patchset-pf/smb/0001-smb-client-fix-memory-leak-during-error-handling-for.patch
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
From 8ef14a884df5aaf48cf5f7ce6c91e7318cb07d4e Mon Sep 17 00:00:00 2001
|
||||
From: Jethro Donaldson <devel@jro.nz>
|
||||
Date: Thu, 15 May 2025 01:23:23 +1200
|
||||
Subject: smb: client: fix memory leak during error handling for POSIX mkdir
|
||||
|
||||
The response buffer for the CREATE request handled by smb311_posix_mkdir()
|
||||
is leaked on the error path (goto err_free_rsp_buf) because the structure
|
||||
pointer *rsp passed to free_rsp_buf() is not assigned until *after* the
|
||||
error condition is checked.
|
||||
|
||||
As *rsp is initialised to NULL, free_rsp_buf() becomes a no-op and the leak
|
||||
is instead reported by __kmem_cache_shutdown() upon subsequent rmmod of
|
||||
cifs.ko if (and only if) the error path has been hit.
|
||||
|
||||
Pass rsp_iov.iov_base to free_rsp_buf() instead, similar to the code in
|
||||
other functions in smb2pdu.c for which *rsp is assigned late.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Jethro Donaldson <devel@jro.nz>
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/client/smb2pdu.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/smb/client/smb2pdu.c
|
||||
+++ b/fs/smb/client/smb2pdu.c
|
||||
@@ -2967,7 +2967,7 @@ replay_again:
|
||||
/* Eventually save off posix specific response info and timestamps */
|
||||
|
||||
err_free_rsp_buf:
|
||||
- free_rsp_buf(resp_buftype, rsp);
|
||||
+ free_rsp_buf(resp_buftype, rsp_iov.iov_base);
|
||||
kfree(pc_buf);
|
||||
err_free_req:
|
||||
cifs_small_buf_release(req);
|
@@ -1,37 +0,0 @@
|
||||
From fb87d390de327c76b11ed544de83771118f7b0c5 Mon Sep 17 00:00:00 2001
|
||||
From: Norbert Szetei <norbert@doyensec.com>
|
||||
Date: Fri, 2 May 2025 08:21:58 +0900
|
||||
Subject: ksmbd: prevent out-of-bounds stream writes by validating *pos
|
||||
|
||||
ksmbd_vfs_stream_write() did not validate whether the write offset
|
||||
(*pos) was within the bounds of the existing stream data length (v_len).
|
||||
If *pos was greater than or equal to v_len, this could lead to an
|
||||
out-of-bounds memory write.
|
||||
|
||||
This patch adds a check to ensure *pos is less than v_len before
|
||||
proceeding. If the condition fails, -EINVAL is returned.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Norbert Szetei <norbert@doyensec.com>
|
||||
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/server/vfs.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
--- a/fs/smb/server/vfs.c
|
||||
+++ b/fs/smb/server/vfs.c
|
||||
@@ -443,6 +443,13 @@ static int ksmbd_vfs_stream_write(struct
|
||||
goto out;
|
||||
}
|
||||
|
||||
+ if (v_len <= *pos) {
|
||||
+ pr_err("stream write position %lld is out of bounds (stream length: %zd)\n",
|
||||
+ *pos, v_len);
|
||||
+ err = -EINVAL;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
if (v_len < size) {
|
||||
wbuf = kvzalloc(size, KSMBD_DEFAULT_GFP);
|
||||
if (!wbuf) {
|
@@ -1,74 +0,0 @@
|
||||
From 67ea573ce44aeac74e659879cdeb6ac39212d0b9 Mon Sep 17 00:00:00 2001
|
||||
From: Sean Heelan <seanheelan@gmail.com>
|
||||
Date: Tue, 6 May 2025 22:04:52 +0900
|
||||
Subject: ksmbd: Fix UAF in __close_file_table_ids
|
||||
|
||||
A use-after-free is possible if one thread destroys the file
|
||||
via __ksmbd_close_fd while another thread holds a reference to
|
||||
it. The existing checks on fp->refcount are not sufficient to
|
||||
prevent this.
|
||||
|
||||
The fix takes ft->lock around the section which removes the
|
||||
file from the file table. This prevents two threads acquiring the
|
||||
same file pointer via __close_file_table_ids, as well as the other
|
||||
functions which retrieve a file from the IDR and which already use
|
||||
this same lock.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Sean Heelan <seanheelan@gmail.com>
|
||||
Acked-by: Namjae Jeon <linkinjeon@kernel.org>
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/server/vfs_cache.c | 33 ++++++++++++++++++++++++++-------
|
||||
1 file changed, 26 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/fs/smb/server/vfs_cache.c
|
||||
+++ b/fs/smb/server/vfs_cache.c
|
||||
@@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file
|
||||
bool (*skip)(struct ksmbd_tree_connect *tcon,
|
||||
struct ksmbd_file *fp))
|
||||
{
|
||||
- unsigned int id;
|
||||
- struct ksmbd_file *fp;
|
||||
- int num = 0;
|
||||
+ struct ksmbd_file *fp;
|
||||
+ unsigned int id = 0;
|
||||
+ int num = 0;
|
||||
|
||||
- idr_for_each_entry(ft->idr, fp, id) {
|
||||
- if (skip(tcon, fp))
|
||||
+ while (1) {
|
||||
+ write_lock(&ft->lock);
|
||||
+ fp = idr_get_next(ft->idr, &id);
|
||||
+ if (!fp) {
|
||||
+ write_unlock(&ft->lock);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (skip(tcon, fp) ||
|
||||
+ !atomic_dec_and_test(&fp->refcount)) {
|
||||
+ id++;
|
||||
+ write_unlock(&ft->lock);
|
||||
continue;
|
||||
+ }
|
||||
|
||||
set_close_state_blocked_works(fp);
|
||||
+ idr_remove(ft->idr, fp->volatile_id);
|
||||
+ fp->volatile_id = KSMBD_NO_FID;
|
||||
+ write_unlock(&ft->lock);
|
||||
+
|
||||
+ down_write(&fp->f_ci->m_lock);
|
||||
+ list_del_init(&fp->node);
|
||||
+ up_write(&fp->f_ci->m_lock);
|
||||
|
||||
- if (!atomic_dec_and_test(&fp->refcount))
|
||||
- continue;
|
||||
__ksmbd_close_fd(ft, fp);
|
||||
+
|
||||
num++;
|
||||
+ id++;
|
||||
}
|
||||
+
|
||||
return num;
|
||||
}
|
||||
|
Reference in New Issue
Block a user