1
0

release 6.12.10

This commit is contained in:
Konstantin Demin 2025-01-18 01:49:54 +03:00
parent d372c88caf
commit 5b35bedab1
20 changed files with 13 additions and 1423 deletions

View File

@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w"
dst='debian/patches/pf-tmp'
src='../linux-extras'
branches='amd-pstate amd-rapl btrfs cpuidle crypto fixes invlpgb kbuild pksm xfs zstd'
branches='btrfs cpuidle crypto fixes kbuild pksm xfs zstd'
if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi
mkdir -p "${dst}"

9
debian/changelog vendored
View File

@ -1,8 +1,15 @@
linux (6.12.10-1) sid; urgency=medium
* New upstream stable update:
https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.10
-- Konstantin Demin <rockdrilla@gmail.com> Sat, 18 Jan 2025 01:39:50 +0300
linux (6.12.9-1) sid; urgency=medium
* New upstream stable update:
https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.9
* Drop "pf-amd-pstate" and "amd-rapl" patchsets.
* Drop "pf/amd-pstate" and "pf/amd-rapl" patchsets.
-- Konstantin Demin <rockdrilla@gmail.com> Thu, 09 Jan 2025 23:12:22 +0300

View File

@ -2491,6 +2491,8 @@ CONFIG_KEXEC_CORE=y
CONFIG_LZ4HC_COMPRESS=m
CONFIG_LZ4_COMPRESS=m
CONFIG_MFD_CORE=m
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_ND_BTT=m
CONFIG_ND_PFN=m
CONFIG_NETFS_SUPPORT=m

View File

@ -4064,6 +4064,8 @@ CONFIG_LZ4_COMPRESS=m
CONFIG_MAPPING_DIRTY_HELPERS=y
CONFIG_MCTP_FLOWS=y
CONFIG_MFD_CORE=m
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_MOUSE_PS2_SMBUS=y
CONFIG_ND_BTT=m
CONFIG_ND_PFN=m

View File

@ -3945,8 +3945,6 @@ CONFIG_MLX4_CORE=m
CONFIG_MMCONF_FAM10H=y
CONFIG_MMU=y
CONFIG_MMU_GATHER_MERGE_VMAS=y
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_MMU_LAZY_TLB_REFCOUNT=y
CONFIG_MMU_NOTIFIER=y
CONFIG_MODULES_TREE_LOOKUP=y

View File

@ -1,45 +0,0 @@
From 1c45e81769d174d02a26a61e3919313fa3b16120 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kaihengf@nvidia.com>
Date: Fri, 6 Dec 2024 15:48:17 +0800
Subject: USB: core: Disable LPM only for non-suspended ports
There's a USB error when the Tegra board is shutting down:
[ 180.919315] usb 2-3: Failed to set U1 timeout to 0x0,error code -113
[ 180.919995] usb 2-3: Failed to set U1 timeout to 0xa,error code -113
[ 180.920512] usb 2-3: Failed to set U2 timeout to 0x4,error code -113
[ 186.157172] tegra-xusb 3610000.usb: xHCI host controller not responding, assume dead
[ 186.157858] tegra-xusb 3610000.usb: HC died; cleaning up
[ 186.317280] tegra-xusb 3610000.usb: Timeout while waiting for evaluate context command
The issue is caused by disabling LPM on already suspended ports.
For USB2 LPM, the LPM is already disabled during port suspend. For USB3
LPM, port won't transit to U1/U2 when it's already suspended in U3,
hence disabling LPM is only needed for ports that are not suspended.
Cc: Wayne Chang <waynec@nvidia.com>
Cc: stable@vger.kernel.org
Fixes: d920a2ed8620 ("usb: Disable USB3 LPM at shutdown")
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
---
drivers/usb/core/port.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/drivers/usb/core/port.c
+++ b/drivers/usb/core/port.c
@@ -452,10 +452,11 @@ static int usb_port_runtime_suspend(stru
static void usb_port_shutdown(struct device *dev)
{
struct usb_port *port_dev = to_usb_port(dev);
+ struct usb_device *udev = port_dev->child;
- if (port_dev->child) {
- usb_disable_usb2_hardware_lpm(port_dev->child);
- usb_unlocked_disable_lpm(port_dev->child);
+ if (udev && !udev->port_is_suspended) {
+ usb_disable_usb2_hardware_lpm(udev);
+ usb_unlocked_disable_lpm(udev);
}
}

View File

@ -1,60 +0,0 @@
From 60fbdd9e9dc7074d4cd30ada3ba9547d5c007702 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:02 -0500
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional
Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
paravirt, and not when running on bare metal.
There is no real good reason to do things differently for
each setup. Make them all the same.
After this change, the synchronization between get_user_pages_fast
and page table freeing is handled by RCU, which prevents page tables
from being reused for other data while get_user_pages_fast is walking
them.
This allows us to invalidate page tables while other CPUs have
interrupts disabled.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
---
arch/x86/Kconfig | 2 +-
arch/x86/kernel/paravirt.c | 7 +------
2 files changed, 2 insertions(+), 7 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,7 +270,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
+ .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,

View File

@ -1,137 +0,0 @@
From 8966aff4928c0bc3aa79b8729d74da5ea782f73a Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:03 -0500
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
arch/x86/hyperv/mmu.c | 1 -
arch/x86/include/asm/paravirt.h | 5 -----
arch/x86/include/asm/paravirt_types.h | 2 --
arch/x86/kernel/kvm.c | 1 -
arch/x86/kernel/paravirt.c | 1 -
arch/x86/mm/pgtable.c | 16 ++++------------
arch/x86/xen/mmu_pv.c | 1 -
7 files changed, 4 insertions(+), 23 deletions(-)
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-#endif
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
{
pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, pte);
+ tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
tlb->need_flush_all = 1;
#endif
pagetable_pmd_dtor(ptdesc);
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+ tlb_remove_table(tlb, ptdesc_page(ptdesc));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
pagetable_pud_dtor(ptdesc);
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,

View File

@ -1,23 +0,0 @@
From efde57842082e36ab2e2be5a11c7b06ff9e18b3d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:04 -0500
Subject: x86/mm: add X86_FEATURE_INVLPGB definition.
Add the INVLPGB CPUID definition, allowing the kernel to recognize
whether the CPU supports the INVLPGB instruction.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/cpufeatures.h | 1 +
1 file changed, 1 insertion(+)
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -335,6 +335,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* "invlpgb" INVLPGB instruction */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */

View File

@ -1,57 +0,0 @@
From 98953e10e342ceea1dc877cfb63318fa85879a59 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:05 -0500
Subject: x86/mm: get INVLPGB count max from CPUID
The CPU advertises the maximum number of pages that can be shot down
with one INVLPGB instruction in the CPUID data.
Save that information for later use.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/kernel/cpu/amd.c | 8 ++++++++
arch/x86/kernel/setup.c | 4 ++++
3 files changed, 13 insertions(+)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -182,6 +182,7 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+extern u16 invlpgb_count_max;
extern void initialize_tlbstate_and_flush(void);
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1135,6 +1135,14 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ invlpgb_count_max = (edx & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -138,6 +138,10 @@ __visible unsigned long mmu_cr4_features
__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
+#ifdef CONFIG_CPU_SUP_AMD
+u16 invlpgb_count_max __ro_after_init;
+#endif
+
#ifdef CONFIG_IMA
static phys_addr_t ima_kexec_buffer_phys;
static size_t ima_kexec_buffer_size;

View File

@ -1,121 +0,0 @@
From bc9d1fa1bd32dca78f38bd2a8557e7fc638308bd Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:06 -0500
Subject: x86/mm: add INVLPGB support code
Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/invlpgb.h | 93 +++++++++++++++++++++++++++++++++
arch/x86/include/asm/tlbflush.h | 1 +
2 files changed, 94 insertions(+)
create mode 100644 arch/x86/include/asm/invlpgb.h
--- /dev/null
+++ b/arch/x86/include/asm/invlpgb.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVLPGB
+#define _ASM_X86_INVLPGB
+
+#include <vdso/bits.h>
+
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr,
+ int extra_count, bool pmd_stride, unsigned long flags)
+{
+ u64 rax = addr | flags;
+ u32 ecx = (pmd_stride << 31) | extra_count;
+ u32 edx = (pcid << 16) | asid;
+
+ asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx));
+}
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first can be used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_VA BIT(0)
+#define INVLPGB_PCID BIT(1)
+#define INVLPGB_ASID BIT(2)
+#define INVLPGB_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FINAL_ONLY BIT(4)
+#define INVLPGB_INCLUDE_NESTED BIT(5)
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invlpgb_flush_user(unsigned long pcid,
+ unsigned long addr)
+{
+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
+}
+
+static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr,
+ int nr, bool pmd_stride)
+{
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+}
+
+/* Flush all mappings for a given ASID, not including globals. */
+static inline void invlpgb_flush_single_asid(unsigned long asid)
+{
+ __invlpgb(asid, 0, 0, 0, 0, INVLPGB_ASID);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid(unsigned long pcid)
+{
+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr(unsigned long addr, int nr)
+{
+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, 0);
+}
+
+/* Wait for INVLPGB originated by this CPU to complete. */
+static inline void tlbsync(void)
+{
+ asm volatile("tlbsync");
+}
+
+#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invlpgb.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>

View File

@ -1,61 +0,0 @@
From ffd834c7140dc5fcaf96161c6d8c4601bb700afe Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:07 -0500
Subject: x86/mm: use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
This stops us from having to send IPIs for kernel TLB flushes.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1048,6 +1048,32 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+static void broadcast_kernel_range_flush(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+ unsigned long maxnr = invlpgb_count_max;
+ unsigned long threshold = tlb_single_page_flush_ceiling * maxnr;
+
+ /*
+ * TLBSYNC only waits for flushes originating on the same CPU.
+ * Disabling migration allows us to wait on all flushes.
+ */
+ guard(preempt)();
+
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > threshold << PAGE_SHIFT) {
+ invlpgb_flush_all();
+ } else {
+ unsigned long nr;
+ for (addr = start; addr < end; addr += nr << PAGE_SHIFT) {
+ nr = min((end - addr) >> PAGE_SHIFT, maxnr);
+ invlpgb_flush_addr(addr, nr);
+ }
+ }
+
+ tlbsync();
+}
+
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
@@ -1060,6 +1086,11 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ broadcast_kernel_range_flush(start, end);
+ return;
+ }
+
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {

View File

@ -1,28 +0,0 @@
From 13fac8226036456c15c517c1dd77be5109a61da2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:08 -0500
Subject: x86/tlb: use INVLPGB in flush_tlb_all
The flush_tlb_all() function is not used a whole lot, but we might
as well use broadcast TLB flushing there, too.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1045,6 +1045,12 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ guard(preempt)();
+ invlpgb_flush_all();
+ tlbsync();
+ return;
+ }
on_each_cpu(do_flush_tlb_all, NULL, 1);
}

View File

@ -1,36 +0,0 @@
From 765d531296765e7fb2888c70cb56c0e25b459231 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:09 -0500
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing
In the page reclaim code, we only track the CPU(s) where the TLB needs
to be flushed, rather than all the individual mappings that may be getting
invalidated.
Use broadcast TLB flushing when that is available.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1281,8 +1281,16 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
struct flush_tlb_info *info;
+ int cpu;
- int cpu = get_cpu();
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ guard(preempt)();
+ invlpgb_flush_all_nonglobals();
+ tlbsync();
+ return;
+ }
+
+ cpu = get_cpu();
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
TLB_GENERATION_INVALID);

View File

@ -1,508 +0,0 @@
From 8b23125a3200a330fb407133f33aeb9ad3232603 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:10 -0500
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
processes
Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
and newer CPUs.
In order to not exhaust PCID space, and keep TLB flushes local for single
threaded processes, we only hand out broadcast ASIDs to processes active on
3 or more CPUs, and gradually increase the threshold as broadcast ASID space
is depleted.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/mmu.h | 6 +
arch/x86/include/asm/mmu_context.h | 12 ++
arch/x86/include/asm/tlbflush.h | 17 ++
arch/x86/mm/tlb.c | 310 ++++++++++++++++++++++++++++-
4 files changed, 336 insertions(+), 9 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -46,6 +46,12 @@ typedef struct {
unsigned long flags;
#endif
+#ifdef CONFIG_CPU_SUP_AMD
+ struct list_head broadcast_asid_list;
+ u16 broadcast_asid;
+ bool asid_transition;
+#endif
+
#ifdef CONFIG_ADDRESS_MASKING
/* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
unsigned long lam_cr3_mask;
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void destroy_context_free_broadcast_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
@@ -160,6 +162,13 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+#ifdef CONFIG_CPU_SUP_AMD
+ INIT_LIST_HEAD(&mm->context.broadcast_asid_list);
+ mm->context.broadcast_asid = 0;
+ mm->context.asid_transition = false;
+#endif
+
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -169,6 +178,9 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+#ifdef CONFIG_CPU_SUP_AMD
+ destroy_context_free_broadcast_asid(mm);
+#endif
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -65,6 +65,23 @@ static inline void cr4_clear_bits(unsign
*/
#define TLB_NR_DYN_ASIDS 6
+#ifdef CONFIG_CPU_SUP_AMD
+#define is_dyn_asid(asid) (asid) < TLB_NR_DYN_ASIDS
+#define is_broadcast_asid(asid) (asid) >= TLB_NR_DYN_ASIDS
+#define in_asid_transition(info) (info->mm && info->mm->context.asid_transition)
+#define mm_broadcast_asid(mm) (mm->context.broadcast_asid)
+#else
+#define is_dyn_asid(asid) true
+#define is_broadcast_asid(asid) false
+#define in_asid_transition(info) false
+#define mm_broadcast_asid(mm) 0
+
+inline bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ return false;
+}
+#endif
+
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,18 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for this ASID is maintained with INVLPGB;
+ * TLB flushes happen even while the process isn't running.
+ */
+#ifdef CONFIG_CPU_SUP_AMD
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(next)) {
+ *new_asid = mm_broadcast_asid(next);
+ *need_flush = false;
+ return;
+ }
+#endif
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -251,6 +265,245 @@ static void choose_new_asid(struct mm_st
*need_flush = true;
}
+#ifdef CONFIG_CPU_SUP_AMD
+/*
+ * Logic for AMD INVLPGB support.
+ */
+static DEFINE_RAW_SPINLOCK(broadcast_asid_lock);
+static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS;
+static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 };
+static LIST_HEAD(broadcast_asid_list);
+static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+static void reset_broadcast_asid_space(void)
+{
+ mm_context_t *context;
+
+ lockdep_assert_held(&broadcast_asid_lock);
+
+ /*
+ * Flush once when we wrap around the ASID space, so we won't need
+ * to flush every time we allocate an ASID for broadcast flushing.
+ */
+ invlpgb_flush_all_nonglobals();
+ tlbsync();
+
+ /*
+ * Leave the currently used broadcast ASIDs set in the bitmap, since
+ * those cannot be reused before the next wraparound and flush.
+ */
+ bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE);
+ list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list)
+ __set_bit(context->broadcast_asid, broadcast_asid_used);
+
+ last_broadcast_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 get_broadcast_asid(void)
+{
+ lockdep_assert_held(&broadcast_asid_lock);
+
+ do {
+ u16 start = last_broadcast_asid;
+ u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start);
+
+ if (asid >= MAX_ASID_AVAILABLE) {
+ reset_broadcast_asid_space();
+ continue;
+ }
+
+ /* Try claiming this broadcast ASID. */
+ if (!test_and_set_bit(asid, broadcast_asid_used)) {
+ last_broadcast_asid = asid;
+ return asid;
+ }
+ } while (1);
+}
+
+/*
+ * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast
+ * (INVLPGB) ASID, or the other way around.
+ */
+static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ u16 broadcast_asid = mm_broadcast_asid(next);
+
+ if (broadcast_asid && prev_asid != broadcast_asid)
+ return true;
+
+ if (!broadcast_asid && is_broadcast_asid(prev_asid))
+ return true;
+
+ return false;
+}
+
+void destroy_context_free_broadcast_asid(struct mm_struct *mm)
+{
+ if (!mm->context.broadcast_asid)
+ return;
+
+ guard(raw_spinlock_irqsave)(&broadcast_asid_lock);
+ mm->context.broadcast_asid = 0;
+ list_del(&mm->context.broadcast_asid_list);
+ broadcast_asid_available++;
+}
+
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a broadcast ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_broadcast_asid(struct mm_struct *mm)
+{
+ guard(raw_spinlock_irqsave)(&broadcast_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm->context.broadcast_asid)
+ return;
+
+ mm->context.broadcast_asid = get_broadcast_asid();
+ mm->context.asid_transition = true;
+ list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
+ broadcast_asid_available--;
+}
+
+/*
+ * Figure out whether to assign a broadcast (global) ASID to a process.
+ * We vary the threshold by how empty or full broadcast ASID space is.
+ * 1/4 full: >= 4 active threads
+ * 1/2 full: >= 8 active threads
+ * 3/4 full: >= 16 active threads
+ * 7/8 full: >= 32 active threads
+ * etc
+ *
+ * This way we should never exhaust the broadcast ASID space, even on very
+ * large systems, and the processes with the largest number of active
+ * threads should be able to use broadcast TLB invalidation.
+ */
+#define HALFFULL_THRESHOLD 8
+static bool meets_broadcast_asid_threshold(struct mm_struct *mm)
+{
+ int avail = broadcast_asid_available;
+ int threshold = HALFFULL_THRESHOLD;
+
+ if (!avail)
+ return false;
+
+ if (avail > MAX_ASID_AVAILABLE * 3 / 4) {
+ threshold = HALFFULL_THRESHOLD / 4;
+ } else if (avail > MAX_ASID_AVAILABLE / 2) {
+ threshold = HALFFULL_THRESHOLD / 2;
+ } else if (avail < MAX_ASID_AVAILABLE / 3) {
+ do {
+ avail *= 2;
+ threshold *= 2;
+ } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2);
+ }
+
+ return mm_active_cpus_exceeds(mm, threshold);
+}
+
+static void count_tlb_flush(struct mm_struct *mm)
+{
+ if (!static_cpu_has(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ if (meets_broadcast_asid_threshold(mm))
+ use_broadcast_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_broadcast_asid(mm);
+ int cpu;
+
+ if (!mm->context.asid_transition)
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the broadcast ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ */
+ if (per_cpu(cpu_tlbstate.loaded_mm_asid, cpu) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the broadcast ASID. */
+ mm->context.asid_transition = 0;
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long maxnr = invlpgb_count_max;
+ unsigned long asid = info->mm->context.broadcast_asid;
+ unsigned long addr = info->start;
+ unsigned long nr;
+
+ /* Flushing multiple pages at once is not supported with 1GB pages. */
+ if (info->stride_shift > PMD_SHIFT)
+ maxnr = 1;
+
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid(user_pcid(asid));
+ } else do {
+ /*
+ * Calculate how many pages can be flushed at once; if the
+ * remainder of the range is less than one page, flush one.
+ */
+ nr = min(maxnr, (info->end - addr) >> info->stride_shift);
+ nr = max(nr, 1);
+
+ invlpgb_flush_user_nr(kern_pcid(asid), addr, nr, pmd);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr(user_pcid(asid), addr, nr, pmd);
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ tlbsync();
+}
+#endif /* CONFIG_CPU_SUP_AMD */
+
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
@@ -556,8 +809,9 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
+ if (is_dyn_asid(prev_asid))
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
* If this races with another thread that enables lam, 'new_lam'
@@ -574,6 +828,23 @@ void switch_mm_irqs_off(struct mm_struct
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
+ * Check if the current mm is transitioning to a new ASID.
+ */
+ if (needs_broadcast_asid_reload(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this PCID up to date
+ * all the time.
+ */
+ if (is_broadcast_asid(prev_asid))
+ return;
+
+ /*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
@@ -629,8 +900,10 @@ void switch_mm_irqs_off(struct mm_struct
barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_BUG_ON(is_broadcast_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -749,7 +1022,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -769,6 +1042,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a broadcast ASID */
+ if (needs_broadcast_asid_reload(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_broadcast_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -786,6 +1069,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -926,7 +1211,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || in_asid_transition(info))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
@@ -998,14 +1283,18 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ unsigned long threshold = tlb_single_page_flush_ceiling;
u64 new_tlb_gen;
int cpu;
+ if (static_cpu_has(X86_FEATURE_INVLPGB))
+ threshold *= invlpgb_count_max;
+
cpu = get_cpu();
/* Should we flush just the requested range? */
if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+ ((end - start) >> stride_shift) > threshold) {
start = 0;
end = TLB_FLUSH_ALL;
}
@@ -1021,8 +1310,11 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD) && mm_broadcast_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
flush_tlb_multi(mm_cpumask(mm), info);
+ count_tlb_flush(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();

View File

@ -1,126 +0,0 @@
From 1767a2786ebbe3451f973df44485309c2a8fd8a5 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:11 -0500
Subject: x86,tlb: do targeted broadcast flushing from tlbbatch code
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/tlbbatch.h | 1 +
arch/x86/include/asm/tlbflush.h | 12 +++------
arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++-------
3 files changed, 42 insertions(+), 19 deletions(-)
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ bool used_invlpgb;
};
#endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -296,21 +296,15 @@ static inline u64 inc_mm_tlb_gen(struct
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1573,16 +1573,7 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
struct flush_tlb_info *info;
- int cpu;
-
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- guard(preempt)();
- invlpgb_flush_all_nonglobals();
- tlbsync();
- return;
- }
-
- cpu = get_cpu();
+ int cpu = get_cpu();
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
TLB_GENERATION_INVALID);
@@ -1600,12 +1591,49 @@ void arch_tlbbatch_flush(struct arch_tlb
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+ tlbsync();
+ migrate_enable();
+ batch->used_invlpgb = false;
+ }
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(mm)) {
+ u16 asid = mm_broadcast_asid(mm);
+ /*
+ * Queue up an asynchronous invalidation. The corresponding
+ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+ * on the same CPU.
+ */
+ if (!batch->used_invlpgb) {
+ batch->used_invlpgb = true;
+ migrate_disable();
+ }
+ invlpgb_flush_user_nr(kern_pcid(asid), uaddr, 1, 0);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr(user_pcid(asid), uaddr, 1, 0);
+ } else {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or

View File

@ -1,82 +0,0 @@
From 13faf551d1a146ed18c448babe1953def4ed3d56 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:12 -0500
Subject: x86/mm: enable AMD translation cache extensions
With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.
This can help reduce the TLB miss rate, by keeping more intermediate
mappings in the cache.
From the AMD manual:
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
TLB entries. When this bit is 0, these instructions remove the target PTE
from the TLB as well as all upper-level table entries that are cached
in the TLB, whether or not they are associated with the target PTE.
When this bit is set, these instructions will remove the target PTE and
only those upper-level entries that lead to the target PTE in
the page table hierarchy, leaving unrelated upper-level entries intact.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/kernel/cpu/amd.c | 8 ++++++++
arch/x86/mm/tlb.c | 10 +++++++---
2 files changed, 15 insertions(+), 3 deletions(-)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1143,6 +1143,14 @@ static void cpu_detect_tlb_amd(struct cp
/* Max number of pages INVLPGB can invalidate in one shot */
invlpgb_count_max = (edx & 0xffff) + 1;
+
+ /* If supported, enable translation cache extensions (TCE) */
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(17)) {
+ u64 msr = native_read_msr(MSR_EFER);;
+ msr |= BIT(15);
+ wrmsrl(MSR_EFER, msr);
+ }
}
static const struct cpu_dev amd_cpu_dev = {
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -477,7 +477,7 @@ static void broadcast_tlb_flush(struct f
if (info->stride_shift > PMD_SHIFT)
maxnr = 1;
- if (info->end == TLB_FLUSH_ALL) {
+ if (info->end == TLB_FLUSH_ALL || info->freed_tables) {
invlpgb_flush_single_pcid(kern_pcid(asid));
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
@@ -1110,7 +1110,7 @@ static void flush_tlb_func(void *info)
*
* The only question is whether to do a full or partial flush.
*
- * We do a partial flush if requested and two extra conditions
+ * We do a partial flush if requested and three extra conditions
* are met:
*
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
@@ -1137,10 +1137,14 @@ static void flush_tlb_func(void *info)
* date. By doing a full flush instead, we can increase
* local_tlb_gen all the way to mm_tlb_gen and we can probably
* avoid another flush in the very near future.
+ *
+ * 3. No page tables were freed. If page tables were freed, a full
+ * flush ensures intermediate translations in the TLB get flushed.
*/
if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
- f->new_tlb_gen == mm_tlb_gen) {
+ f->new_tlb_gen == mm_tlb_gen &&
+ !f->freed_tables) {
/* Partial flush */
unsigned long addr = f->start;

View File

@ -1,28 +0,0 @@
From 2fc0be5fbcee1a62162b699451bb94f90ec64244 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 30 Dec 2024 12:53:13 -0500
Subject: x86/mm: only invalidate final translations with INVLPGB
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.
On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/invlpgb.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -51,7 +51,7 @@ static inline void invlpgb_flush_user(un
static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr,
int nr, bool pmd_stride)
{
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA | INVLPGB_FINAL_ONLY);
}
/* Flush all mappings for a given ASID, not including globals. */

View File

@ -1,92 +0,0 @@
From a3ff46a157cadb29349c5b388fc70804c351e561 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 19 Dec 2024 15:32:53 -0500
Subject: mm: remove unnecessary calls to lru_add_drain
There seem to be several categories of calls to lru_add_drain
and lru_add_drain_all.
The first are code paths that recently allocated, swapped in,
or otherwise processed a batch of pages, and want them all on
the LRU. These drain pages that were recently allocated,
probably on the local CPU.
A second category consists of code paths that are actively trying to
reclaim, migrate, or offline memory. These often use lru_add_drain_all,
to drain the caches on all CPUs.
However, there also seem to be some other callers where we
aren't really doing either. They are calling lru_add_drain(),
despite operating on pages that may have been allocated
long ago, and quite possibly on different CPUs.
Those calls are not likely to be effective at anything but
creating lock contention on the LRU locks.
Remove the lru_add_drain calls in the latter category.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
---
mm/memory.c | 1 -
mm/mmap.c | 2 --
mm/swap_state.c | 1 -
mm/vma.c | 2 --
4 files changed, 6 deletions(-)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
vma, new_start, length, false, true))
return -ENOMEM;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,

15
debian/patches/series vendored
View File

@ -173,20 +173,6 @@ patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.pat
patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch
patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch
patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch
patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch
patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch
patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch
patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch
patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch
patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch
patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch
patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch
patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch
patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch
@ -261,6 +247,5 @@ patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.pat
patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch
patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch
patchset-pf/fixes/0003-USB-core-Disable-LPM-only-for-non-suspended-ports.patch
patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch