1
0

release 6.14.7

This commit is contained in:
2025-05-18 12:32:18 +03:00
parent 5c06d65ded
commit a5a8a2798d
48 changed files with 215 additions and 737 deletions

View File

@@ -0,0 +1,149 @@
From 2ffeb0d8d193c35403cea13d3b7273b523631007 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 13 Feb 2025 11:13:52 -0500
Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional
Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
paravirt, and not when running on bare metal.
There is no real good reason to do things differently for
each setup. Make them all the same.
Currently get_user_pages_fast synchronizes against page table
freeing in two different ways:
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
This is done because some paravirt TLB flush implementations
handle the TLB flush in the hypervisor, and will do the flush
even when the target CPU has interrupts disabled.
Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
Using RCU synchronization between page table freeing and get_user_pages_fast()
allows bare metal to also do TLB flushing while interrupts are disabled.
Various places in the mm do still block IRQs or disable preemption
as an implicit way to block RCU frees.
That makes it safe to use INVLPGB on AMD CPUs.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com
---
arch/x86/Kconfig | 2 +-
arch/x86/kernel/paravirt.c | 17 +----------------
arch/x86/mm/pgtable.c | 27 ++++-----------------------
3 files changed, 6 insertions(+), 40 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,7 +277,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -197,7 +182,7 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
+ .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-#ifndef CONFIG_PT_RECLAIM
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif /* !CONFIG_PT_RECLAIM */
-#endif /* !CONFIG_PARAVIRT */
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+ tlb_remove_table(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ tlb_remove_table(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */

View File

@@ -0,0 +1,89 @@
From aadea0887cca5739137f109eab0e1b38604c8af8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 13 Feb 2025 11:13:53 -0500
Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com
---
arch/x86/hyperv/mmu.c | 1 -
arch/x86/include/asm/paravirt.h | 5 -----
arch/x86/include/asm/paravirt_types.h | 2 --
arch/x86/kernel/kvm.c | 1 -
arch/x86/kernel/paravirt.c | 1 -
arch/x86/xen/mmu_pv.c | 1 -
6 files changed, 11 deletions(-)
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -133,8 +133,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -182,7 +182,6 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,

View File

@@ -0,0 +1,87 @@
From 170f37d1499a28f7a1902e007111867c7cf0147f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:36 -0500
Subject: x86/mm: Consolidate full flush threshold decision
Reduce code duplication by consolidating the decision point for whether to do
individual invalidations or a full flush inside get_flush_tlb_info().
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
---
arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
1 file changed, 19 insertions(+), 22 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1019,6 +1019,15 @@ static struct flush_tlb_info *get_flush_
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ start = 0;
+ end = TLB_FLUSH_ALL;
+ }
+
info->start = start;
info->end = end;
info->mm = mm;
@@ -1045,17 +1054,8 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1108,22 +1108,19 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
+ struct flush_tlb_info *info;
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+ guard(preempt)();
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+
+ if (info->end == TLB_FLUSH_ALL)
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
- put_flush_tlb_info();
- preempt_enable();
- }
+ put_flush_tlb_info();
}
/*

View File

@@ -0,0 +1,103 @@
From acb5a284db4fa3dbbb246ab8fa58da0143cd68ce Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:37 -0500
Subject: x86/mm: Add INVLPGB feature and Kconfig entry
In addition, the CPU advertises the maximum number of pages that can be
shot down with one INVLPGB instruction in CPUID. Save that information
for later use.
[ bp: use cpu_has(), typos, massage. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com
---
arch/x86/Kconfig.cpu | 4 ++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/disabled-features.h | 8 +++++++-
arch/x86/include/asm/tlbflush.h | 3 +++
arch/x86/kernel/cpu/amd.c | 6 ++++++
5 files changed, 21 insertions(+), 1 deletion(-)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT
This lets you choose what x86 vendor support code your kernel
will include.
+config BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
config CPU_SUP_INTEL
default y
bool "Support Intel processors" if PROCESSOR_SELECT
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -338,6 +338,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -129,6 +129,12 @@
#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
#endif
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+#define DISABLE_INVLPGB 0
+#else
+#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -146,7 +152,7 @@
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM)
-#define DISABLED_MASK13 0
+#define DISABLED_MASK13 (DISABLE_INVLPGB)
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can be invalidated with one INVLPGB. */
+extern u16 invlpgb_count_max;
+
extern void initialize_tlbstate_and_flush(void);
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -1145,6 +1147,10 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {

View File

@@ -0,0 +1,170 @@
From 27bab4a6ed6ee7b7b0e2d216b8802800ef26b2ad Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Fri, 28 Feb 2025 20:32:30 +0100
Subject: x86/mm: Add INVLPGB support code
Add helper functions and definitions needed to use broadcast TLB
invalidation on AMD CPUs.
[ bp:
- Cleanup commit message
- Improve and expand comments
- push the preemption guards inside the invlpgb* helpers
- merge improvements from dhansen
- add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
can't do DCE properly yet and looks at the inline asm and
complains about it getting a u64 argument on 32-bit code ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
---
arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
1 file changed, 132 insertions(+)
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,6 +6,9 @@
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
+enum addr_stride {
+ PTE_STRIDE = 0,
+ PMD_STRIDE = 1
+};
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * The instruction takes the number of extra pages to invalidate, beyond
+ * the first page, while __invlpgb gets the more human readable number of
+ * pages to invalidate.
+ *
+ * The bits in rax[0:2] determine respectively which components of the address
+ * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
+ * address in the specified range matches.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride stride, u8 flags)
+{
+ u32 edx = (pcid << 16) | asid;
+ u32 ecx = (stride << 31) | (nr_pages - 1);
+ u64 rax = addr | flags;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
+}
+
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
+{
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
+}
+
+static inline void __tlbsync(void)
+{
+ /*
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
+ * to have completed. Print a warning if the task has been migrated,
+ * and might not be waiting on all the INVLPGBs issued during this TLB
+ * invalidation sequence.
+ */
+ cant_migrate();
+
+ /* TLBSYNC: supported in binutils >= 0.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+#else
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride s, u8 flags) { }
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
+static inline void __tlbsync(void) { }
+#endif
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool stride)
+{
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
+
+ __invlpgb(0, pcid, addr, nr, str, flags);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ /*
+ * TLBSYNC at the end needs to make sure all flushes done on the
+ * current CPU have been executed system-wide. Therefore, make
+ * sure nothing gets migrated in-between but disable preemption
+ * as it is cheaper.
+ */
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
+ __tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
+ __tlbsync();
+}
#endif /* _ASM_X86_TLB_H */

View File

@@ -0,0 +1,97 @@
From 358d71638f420efe8f7e05ce74aefe13e9320283 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:39 -0500
Subject: x86/mm: Use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.
[ bp: Integrate dhansen's comments additions, merge the
flush_tlb_all() change into this one too. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
---
arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 44 insertions(+), 4 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1083,7 +1083,6 @@ void flush_tlb_mm_range(struct mm_struct
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1093,7 +1092,32 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+ /* First try (faster) hardware-assisted TLB invalidation. */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ /* Fall back to the IPI-based invalidation. */
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr, nr;
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = (info->end - addr) >> PAGE_SHIFT;
+
+ /*
+ * INVLPGB has a limit on the size of ranges it can
+ * flush. Break up large flushes.
+ */
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ __tlbsync();
}
static void do_kernel_range_flush(void *info)
@@ -1106,6 +1130,22 @@ static void do_kernel_range_flush(void *
flush_tlb_one_kernel(addr);
}
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_kernel_range_flush(info);
+ else
+ on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
struct flush_tlb_info *info;
@@ -1116,9 +1156,9 @@ void flush_tlb_kernel_range(unsigned lon
TLB_GENERATION_INVALID);
if (info->end == TLB_FLUSH_ALL)
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ kernel_tlb_flush_all(info);
else
- on_each_cpu(do_kernel_range_flush, info, 1);
+ kernel_tlb_flush_range(info);
put_flush_tlb_info();
}

View File

@@ -0,0 +1,32 @@
From 7cf099de79e12d6c4949f733c8cbb241bb08f07a Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:41 -0500
Subject: x86/mm: Use broadcast TLB flushing in page reclaim
Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather
than all the individual mappings that may be getting invalidated.
Use broadcast TLB flushing when that is available.
[ bp: Massage commit message. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com
---
arch/x86/mm/tlb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1339,7 +1339,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ invlpgb_flush_all_nonglobals();
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();

View File

@@ -0,0 +1,286 @@
From f9ecaaca7ac26789d7d3e0d8022b7c99599dc8a3 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:42 -0500
Subject: x86/mm: Add global ASID allocation helper functions
Add functions to manage global ASID space. Multithreaded processes that are
simultaneously active on 4 or more CPUs can get a global ASID, resulting in the
same PCID being used for that process on every CPU.
This in turn will allow the kernel to use hardware-assisted TLB flushing
through AMD INVLPGB or Intel RAR for these processes.
[ bp:
- Extend use_global_asid() comment
- s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g
- other touchups ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com
---
arch/x86/include/asm/mmu.h | 12 +++
arch/x86/include/asm/mmu_context.h | 2 +
arch/x86/include/asm/tlbflush.h | 37 +++++++
arch/x86/mm/tlb.c | 154 ++++++++++++++++++++++++++++-
4 files changed, 202 insertions(+), 3 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -69,6 +69,18 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ /*
+ * The global ASID will be a non-zero value when the process has
+ * the same ASID across all CPUs, allowing it to make use of
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
+ */
+ u16 global_asid;
+
+ /* The process is transitioning to a new global ASID number. */
+ bool asid_transition;
+#endif
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void mm_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+static inline bool is_dyn_asid(u16 asid)
+{
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+{
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+#else
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st
}
/*
+ * Global ASIDs are allocated for multi-threaded processes that are
+ * active on multiple CPUs simultaneously, giving each of those
+ * processes the same PCID on every CPU, for use with hardware-assisted
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
+ *
+ * These global ASIDs are held for the lifetime of the process.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+/*
+ * When the search for a free ASID in the global ASID space reaches
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
+ * freed global ASIDs are safe to re-use.
+ *
+ * This way the global flush only needs to happen at ASID rollover
+ * time, and not at ASID allocation time.
+ */
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * The TLB flush above makes it safe to re-use the previously
+ * freed global ASIDs.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /* Restart the search from the start of global ASID space. */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 allocate_global_asid(void)
+{
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previous allocation hit the edge of available address space */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
+ global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Check whether a process is currently active on more than @threshold CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm_global_asid(mm))
+ return;
+
+ /*
+ * The last global ASID was consumed while waiting for the lock.
+ *
+ * If this fires, a more aggressive ASID reuse scheme might be
+ * needed.
+ */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = allocate_global_asid();
+ if (!asid)
+ return;
+
+ mm_assign_global_asid(mm, asid);
+}
+
+void mm_free_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ if (!mm_global_asid(mm))
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+#endif
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*

View File

@@ -0,0 +1,219 @@
From b56070b9f121507cabe352e03f0c534db2d5adc7 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:43 -0500
Subject: x86/mm: Handle global ASID context switch and TLB flush
Do context switch and TLB flush support for processes that use a global
ASID and PCID across all CPUs.
At both context switch time and TLB flush time, it needs to be checked whether
a task is switching to a global ASID, and, if so, reload the TLB with the new
ASID as appropriate.
In both code paths, the TLB flush is avoided if a global ASID is used, because
the global ASIDs are always kept up to date across CPUs, even when the
process is not running on a CPU.
[ bp:
- Massage
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com
---
arch/x86/include/asm/tlbflush.h | 14 ++++++
arch/x86/mm/tlb.c | 77 ++++++++++++++++++++++++++++++---
2 files changed, 84 insertions(+), 7 deletions(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid)
return asid < TLB_NR_DYN_ASIDS;
}
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
#ifdef CONFIG_BROADCAST_TLB_FLUSH
static inline u16 mm_global_asid(struct mm_struct *mm)
{
@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid
mm->context.asid_transition = true;
smp_store_release(&mm->context.global_asid, asid);
}
+
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
+}
#else
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
#ifdef CONFIG_PARAVIRT
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with hardware assisted
+ * remote TLB flushing. Global ASIDs are always up to date.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc
}
/*
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
+ */
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
+{
+ u16 global_asid = mm_global_asid(mm);
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ /* Process is transitioning to a global ASID */
+ if (global_asid && asid != global_asid)
+ return true;
+
+ return false;
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
+ /* Check if the current mm is transitioning to a global ASID */
+ if (mm_needs_global_asid(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this ASID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that CR3 is changing.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Leave this CPU in prev's mm_cpumask. Atomic writes to
* mm_cpumask can be expensive under contention. The CPU
* will be removed lazily at TLB flush time.
@@ -768,18 +821,12 @@ void switch_mm_irqs_off(struct mm_struct
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /*
- * Indicate that CR3 is about to change. nmi_uaccess_okay()
- * and others are sensitive to the window where mm_cpumask(),
- * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
- */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -898,7 +945,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -921,6 +968,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -938,6 +995,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -1120,7 +1179,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,

View File

@@ -0,0 +1,88 @@
From 6d3b8545e2c3c638363fb449a99b5a6cbab87a49 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:44 -0500
Subject: x86/mm: Add global ASID process exit helpers
A global ASID is allocated for the lifetime of a process. Free the global ASID
at process exit time.
[ bp: Massage, create helpers, hide details inside them. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com
---
arch/x86/include/asm/mmu_context.h | 8 +++++++-
arch/x86/include/asm/tlbflush.h | 9 +++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
-#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
@@ -13,6 +12,7 @@
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
+#include <asm/desc.h>
extern atomic64_t last_mm_ctx_id;
@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+#define mm_init_global_asid mm_init_global_asid
+extern void mm_init_global_asid(struct mm_struct *mm);
+
extern void mm_free_global_asid(struct mm_struct *mm);
/*
@@ -163,6 +166,8 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+ mm_init_global_asid(mm);
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -172,6 +177,7 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+ mm_free_global_asid(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct
return asid;
}
+static inline void mm_init_global_asid(struct mm_struct *mm)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+}
+
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
{
/*
@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
}
#else
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */

View File

@@ -0,0 +1,219 @@
From 077e9ceb65f514ea63afc65cce86ce8677e77012 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:45 -0500
Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
processes
There is not enough room in the 12-bit ASID address space to hand out
broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
when they are observed to be simultaneously running on 4 or more CPUs.
This also allows single threaded process to continue using the cheaper, local
TLB invalidation instructions like INVLPGB.
Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
a generically named broadcast_tlb_flush() function which can later also be
used for Intel RAR.
Combined with the removal of unnecessary lru_add_drain calls() (see
https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
a nice performance boost for the will-it-scale tlb_flush2_threads test on an
AMD Milan system with 36 cores:
- vanilla kernel: 527k loops/second
- lru_add_drain removal: 731k loops/second
- only INVLPGB: 527k loops/second
- lru_add_drain + INVLPGB: 1157k loops/second
Profiling with only the INVLPGB changes showed while TLB invalidation went
down from 40% of the total CPU time to only around 4% of CPU time, the
contention simply moved to the LRU lock.
Fixing both at the same time about doubles the number of iterations per second
from this case.
Comparing will-it-scale tlb_flush2_threads with several different numbers of
threads on a 72 CPU AMD Milan shows similar results. The number represents the
total number of loops per second across all the threads:
threads tip INVLPGB
1 315k 304k
2 423k 424k
4 644k 1032k
8 652k 1267k
16 737k 1368k
32 759k 1199k
64 636k 1094k
72 609k 993k
1 and 2 thread performance is similar with and without INVLPGB, because
INVLPGB is only used on processes using 4 or more CPUs simultaneously.
The number is the median across 5 runs.
Some numbers closer to real world performance can be found at Phoronix, thanks
to Michael:
https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits
[ bp:
- Massage
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
- :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
- Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com
]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
---
arch/x86/include/asm/tlbflush.h | 6 ++
arch/x86/mm/tlb.c | 104 +++++++++++++++++++++++++++++++-
2 files changed, 109 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid
smp_store_release(&mm->context.global_asid, asid);
}
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
+{
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct
}
/*
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
+ * global ASIDs are handed out to processes that have frequent TLB
+ * flushes and are active on 4 or more CPUs simultaneously.
+ */
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ if (mm_active_cpus_exceeds(mm, 3))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!mm_in_asid_transition(mm))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ mm_clear_asid_transition(mm);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long asid = mm_global_asid(info->mm);
+ unsigned long addr = info->start;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ unsigned long nr = 1;
+
+ if (info->stride_shift <= PMD_SHIFT) {
+ nr = (info->end - addr) >> info->stride_shift;
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+ }
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ __tlbsync();
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -1275,9 +1374,12 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();

View File

@@ -0,0 +1,83 @@
From 1994cff363a37aff5b1232ca9f757b02ae244956 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:47 -0500
Subject: x86/mm: Enable AMD translation cache extensions
With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.
This can help reduce the TLB miss rate, by keeping more intermediate mappings
in the cache.
From the AMD manual:
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
entries. When this bit is 0, these instructions remove the target PTE from the
TLB as well as all upper-level table entries that are cached in the TLB,
whether or not they are associated with the target PTE. When this bit is set,
these instructions will remove the target PTE and only those upper-level
entries that lead to the target PTE in the page table hierarchy, leaving
unrelated upper-level entries intact.
[ bp: use cpu_has()... I know, it is a mess. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
---
arch/x86/include/asm/msr-index.h | 2 ++
arch/x86/kernel/cpu/amd.c | 4 ++++
tools/arch/x86/include/asm/msr-index.h | 2 ++
3 files changed, 8 insertions(+)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1081,6 +1081,10 @@ static void init_amd(struct cpuinfo_x86
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*

View File

@@ -0,0 +1,121 @@
From 5932a2c8122050c4a2f71588778feb0677fe32b4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 4 Mar 2025 12:59:56 +0100
Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
the ASID valid bit is not set, the instruction will flush the TLB entries that
match the specified criteria for any ASID, not just the those of the host. If
virtual machines are running on the system, this may result in inadvertent
flushes of guest TLB entries.
When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
not intercepted by the hypervisor, the hardware will replace the requested ASID
with the guest ASID and set the ASID valid bit before doing the broadcast
invalidation. Thus a guest is only able to flush its own TLB entries.
So to limit the host TLB flushing reach, always set the ASID valid bit using an
ASID value of 0 (which represents the host/hypervisor). This will will result in
the desired effect in both host and guest.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local
---
arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 26 deletions(-)
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -33,6 +33,27 @@ enum addr_stride {
PMD_STRIDE = 1
};
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
#ifdef CONFIG_BROADCAST_TLB_FLUSH
/*
* INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
@@ -40,14 +61,20 @@ enum addr_stride {
* The INVLPGB instruction is weakly ordered, and a batch of invalidations can
* be done in a parallel fashion.
*
- * The instruction takes the number of extra pages to invalidate, beyond
- * the first page, while __invlpgb gets the more human readable number of
- * pages to invalidate.
+ * The instruction takes the number of extra pages to invalidate, beyond the
+ * first page, while __invlpgb gets the more human readable number of pages to
+ * invalidate.
*
* The bits in rax[0:2] determine respectively which components of the address
* (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
* address in the specified range matches.
*
+ * Since it is desired to only flush TLB entries for the ASID that is executing
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
+ * use the actual ASID value of the guest.
+ *
* TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
* this CPU have completed.
*/
@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo
unsigned long addr, u16 nr_pages,
enum addr_stride stride, u8 flags)
{
- u32 edx = (pcid << 16) | asid;
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
u32 ecx = (stride << 31) | (nr_pages - 1);
- u64 rax = addr | flags;
+ u32 edx = (pcid << 16) | asid;
/* The low bits in rax are for flags. Verify addr is clean. */
VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne
static inline void __tlbsync(void) { }
#endif
-/*
- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
- * of the three. For example:
- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
- * - FLAG_PCID: invalidate all TLB entries matching the PCID
- *
- * The first is used to invalidate (kernel) mappings at a particular
- * address across all processes.
- *
- * The latter invalidates all TLB entries matching a PCID.
- */
-#define INVLPGB_FLAG_VA BIT(0)
-#define INVLPGB_FLAG_PCID BIT(1)
-#define INVLPGB_FLAG_ASID BIT(2)
-#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
-#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
-#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
-
-/* The implied mode when all bits are clear: */
-#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
-
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr, bool stride)

View File

@@ -0,0 +1,70 @@
From 0e0a5ca37a8e3b06f450f4093ba1b6d6f33c2161 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 19 Mar 2025 13:25:20 -0400
Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped
Track whether pages were unmapped from any MM (even ones with a currently
empty mm_cpumask) by the reclaim code, to figure out whether or not
broadcast TLB flush should be done when reclaim finishes.
The reason any MM must be tracked, and not only ones contributing to the
tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to
date even on CPUs where the MM is not currently active.
This change allows reclaim to avoid doing TLB flushes when only clean page
cache pages and/or slab memory were reclaimed, which is fairly common.
( This is a simpler alternative to the code that was in my INVLPGB series
before, and it seems to capture most of the benefit due to how common
it is to reclaim only page cache. )
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn
---
arch/x86/include/asm/tlbbatch.h | 5 +++++
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 3 ++-
3 files changed, 8 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ /*
+ * Set if pages were unmapped from any MM, even one that does not
+ * have active CPUs in its cpumask.
+ */
+ bool unmapped_pages;
};
#endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ batch->unmapped_pages = true;
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1648,8 +1648,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
invlpgb_flush_all_nonglobals();
+ batch->unmapped_pages = false;
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {

View File

@@ -0,0 +1,92 @@
From 6ae491224973eb4013ee67a8c05c420f057d5fee Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 8 May 2025 15:41:32 -0700
Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently
skipped
tl;dr: There is a window in the mm switching code where the new CR3 is
set and the CPU should be getting TLB flushes for the new mm. But
should_flush_tlb() has a bug and suppresses the flush. Fix it by
widening the window where should_flush_tlb() sends an IPI.
Long Version:
=== History ===
There were a few things leading up to this.
First, updating mm_cpumask() was observed to be too expensive, so it was
made lazier. But being lazy caused too many unnecessary IPIs to CPUs
due to the now-lazy mm_cpumask(). So code was added to cull
mm_cpumask() periodically[2]. But that culling was a bit too aggressive
and skipped sending TLB flushes to CPUs that need them. So here we are
again.
=== Problem ===
The too-aggressive code in should_flush_tlb() strikes in this window:
// Turn on IPIs for this CPU/mm combination, but only
// if should_flush_tlb() agrees:
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
load_new_mm_cr3(need_flush);
// ^ After 'need_flush' is set to false, IPIs *MUST*
// be sent to this CPU and not be ignored.
this_cpu_write(cpu_tlbstate.loaded_mm, next);
// ^ Not until this point does should_flush_tlb()
// become true!
should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3()
and writing to 'loaded_mm', which is a window where they should not be
suppressed. Whoops.
=== Solution ===
Thankfully, the fuzzy "just about to write CR3" window is already marked
with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in
should_flush_tlb() is sufficient to ensure that the CPU is targeted with
an IPI.
This will cause more TLB flush IPIs. But the window is relatively small
and I do not expect this to cause any kind of measurable performance
impact.
Update the comment where LOADED_MM_SWITCHING is written since it grew
yet another user.
Peter Z also raised a concern that should_flush_tlb() might not observe
'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off()
writes them. Add a barrier to ensure that they are observed in the
order they are written.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Rik van Riel <riel@surriel.com>
Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1]
Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2]
Reported-by: Stephen Dolan <sdolan@janestreet.com>
Cc: stable@vger.kernel.org
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
arch/x86/mm/tlb.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -900,8 +900,9 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
- * Let nmi_uaccess_okay() and finish_asid_transition()
- * know that CR3 is changing.
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
+ * and others are sensitive to the window where mm_cpumask(),
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
*/
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();