release 6.12.10
This commit is contained in:
parent
d372c88caf
commit
5b35bedab1
2
debian/bin/genpatch-pfkernel
vendored
2
debian/bin/genpatch-pfkernel
vendored
@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w"
|
||||
|
||||
dst='debian/patches/pf-tmp'
|
||||
src='../linux-extras'
|
||||
branches='amd-pstate amd-rapl btrfs cpuidle crypto fixes invlpgb kbuild pksm xfs zstd'
|
||||
branches='btrfs cpuidle crypto fixes kbuild pksm xfs zstd'
|
||||
|
||||
if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi
|
||||
mkdir -p "${dst}"
|
||||
|
9
debian/changelog
vendored
9
debian/changelog
vendored
@ -1,8 +1,15 @@
|
||||
linux (6.12.10-1) sid; urgency=medium
|
||||
|
||||
* New upstream stable update:
|
||||
https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.10
|
||||
|
||||
-- Konstantin Demin <rockdrilla@gmail.com> Sat, 18 Jan 2025 01:39:50 +0300
|
||||
|
||||
linux (6.12.9-1) sid; urgency=medium
|
||||
|
||||
* New upstream stable update:
|
||||
https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.9
|
||||
* Drop "pf-amd-pstate" and "amd-rapl" patchsets.
|
||||
* Drop "pf/amd-pstate" and "pf/amd-rapl" patchsets.
|
||||
|
||||
-- Konstantin Demin <rockdrilla@gmail.com> Thu, 09 Jan 2025 23:12:22 +0300
|
||||
|
||||
|
2
debian/config/amd64/config.cloud
vendored
2
debian/config/amd64/config.cloud
vendored
@ -2491,6 +2491,8 @@ CONFIG_KEXEC_CORE=y
|
||||
CONFIG_LZ4HC_COMPRESS=m
|
||||
CONFIG_LZ4_COMPRESS=m
|
||||
CONFIG_MFD_CORE=m
|
||||
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
|
||||
CONFIG_MMU_GATHER_TABLE_FREE=y
|
||||
CONFIG_ND_BTT=m
|
||||
CONFIG_ND_PFN=m
|
||||
CONFIG_NETFS_SUPPORT=m
|
||||
|
2
debian/config/amd64/config.vm
vendored
2
debian/config/amd64/config.vm
vendored
@ -4064,6 +4064,8 @@ CONFIG_LZ4_COMPRESS=m
|
||||
CONFIG_MAPPING_DIRTY_HELPERS=y
|
||||
CONFIG_MCTP_FLOWS=y
|
||||
CONFIG_MFD_CORE=m
|
||||
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
|
||||
CONFIG_MMU_GATHER_TABLE_FREE=y
|
||||
CONFIG_MOUSE_PS2_SMBUS=y
|
||||
CONFIG_ND_BTT=m
|
||||
CONFIG_ND_PFN=m
|
||||
|
2
debian/config/config
vendored
2
debian/config/config
vendored
@ -3945,8 +3945,6 @@ CONFIG_MLX4_CORE=m
|
||||
CONFIG_MMCONF_FAM10H=y
|
||||
CONFIG_MMU=y
|
||||
CONFIG_MMU_GATHER_MERGE_VMAS=y
|
||||
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
|
||||
CONFIG_MMU_GATHER_TABLE_FREE=y
|
||||
CONFIG_MMU_LAZY_TLB_REFCOUNT=y
|
||||
CONFIG_MMU_NOTIFIER=y
|
||||
CONFIG_MODULES_TREE_LOOKUP=y
|
||||
|
@ -1,45 +0,0 @@
|
||||
From 1c45e81769d174d02a26a61e3919313fa3b16120 Mon Sep 17 00:00:00 2001
|
||||
From: Kai-Heng Feng <kaihengf@nvidia.com>
|
||||
Date: Fri, 6 Dec 2024 15:48:17 +0800
|
||||
Subject: USB: core: Disable LPM only for non-suspended ports
|
||||
|
||||
There's a USB error when the Tegra board is shutting down:
|
||||
[ 180.919315] usb 2-3: Failed to set U1 timeout to 0x0,error code -113
|
||||
[ 180.919995] usb 2-3: Failed to set U1 timeout to 0xa,error code -113
|
||||
[ 180.920512] usb 2-3: Failed to set U2 timeout to 0x4,error code -113
|
||||
[ 186.157172] tegra-xusb 3610000.usb: xHCI host controller not responding, assume dead
|
||||
[ 186.157858] tegra-xusb 3610000.usb: HC died; cleaning up
|
||||
[ 186.317280] tegra-xusb 3610000.usb: Timeout while waiting for evaluate context command
|
||||
|
||||
The issue is caused by disabling LPM on already suspended ports.
|
||||
|
||||
For USB2 LPM, the LPM is already disabled during port suspend. For USB3
|
||||
LPM, port won't transit to U1/U2 when it's already suspended in U3,
|
||||
hence disabling LPM is only needed for ports that are not suspended.
|
||||
|
||||
Cc: Wayne Chang <waynec@nvidia.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: d920a2ed8620 ("usb: Disable USB3 LPM at shutdown")
|
||||
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
|
||||
Acked-by: Alan Stern <stern@rowland.harvard.edu>
|
||||
---
|
||||
drivers/usb/core/port.c | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/drivers/usb/core/port.c
|
||||
+++ b/drivers/usb/core/port.c
|
||||
@@ -452,10 +452,11 @@ static int usb_port_runtime_suspend(stru
|
||||
static void usb_port_shutdown(struct device *dev)
|
||||
{
|
||||
struct usb_port *port_dev = to_usb_port(dev);
|
||||
+ struct usb_device *udev = port_dev->child;
|
||||
|
||||
- if (port_dev->child) {
|
||||
- usb_disable_usb2_hardware_lpm(port_dev->child);
|
||||
- usb_unlocked_disable_lpm(port_dev->child);
|
||||
+ if (udev && !udev->port_is_suspended) {
|
||||
+ usb_disable_usb2_hardware_lpm(udev);
|
||||
+ usb_unlocked_disable_lpm(udev);
|
||||
}
|
||||
}
|
||||
|
@ -1,60 +0,0 @@
|
||||
From 60fbdd9e9dc7074d4cd30ada3ba9547d5c007702 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:02 -0500
|
||||
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional
|
||||
|
||||
Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
|
||||
paravirt, and not when running on bare metal.
|
||||
|
||||
There is no real good reason to do things differently for
|
||||
each setup. Make them all the same.
|
||||
|
||||
After this change, the synchronization between get_user_pages_fast
|
||||
and page table freeing is handled by RCU, which prevents page tables
|
||||
from being reused for other data while get_user_pages_fast is walking
|
||||
them.
|
||||
|
||||
This allows us to invalidate page tables while other CPUs have
|
||||
interrupts disabled.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Suggested-by: Peter Zijlstra <peterz@infradead.org>
|
||||
---
|
||||
arch/x86/Kconfig | 2 +-
|
||||
arch/x86/kernel/paravirt.c | 7 +------
|
||||
2 files changed, 2 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/arch/x86/Kconfig
|
||||
+++ b/arch/x86/Kconfig
|
||||
@@ -270,7 +270,7 @@ config X86
|
||||
select HAVE_PCI
|
||||
select HAVE_PERF_REGS
|
||||
select HAVE_PERF_USER_STACK_DUMP
|
||||
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
|
||||
+ select MMU_GATHER_RCU_TABLE_FREE
|
||||
select MMU_GATHER_MERGE_VMAS
|
||||
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
|
||||
select HAVE_REGS_AND_STACK_ACCESS_API
|
||||
--- a/arch/x86/kernel/paravirt.c
|
||||
+++ b/arch/x86/kernel/paravirt.c
|
||||
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
|
||||
static_branch_enable(&virt_spin_lock_key);
|
||||
}
|
||||
|
||||
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- tlb_remove_page(tlb, table);
|
||||
-}
|
||||
-
|
||||
struct static_key paravirt_steal_enabled;
|
||||
struct static_key paravirt_steal_rq_enabled;
|
||||
|
||||
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
|
||||
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
||||
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
||||
.mmu.flush_tlb_multi = native_flush_tlb_multi,
|
||||
- .mmu.tlb_remove_table = native_tlb_remove_table,
|
||||
+ .mmu.tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.mmu.exit_mmap = paravirt_nop,
|
||||
.mmu.notify_page_enc_status_changed = paravirt_nop,
|
@ -1,137 +0,0 @@
|
||||
From 8966aff4928c0bc3aa79b8729d74da5ea782f73a Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:03 -0500
|
||||
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call
|
||||
|
||||
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
|
||||
|
||||
Get rid of the indirection by simply calling tlb_remove_table directly,
|
||||
and not going through the paravirt function pointers.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
---
|
||||
arch/x86/hyperv/mmu.c | 1 -
|
||||
arch/x86/include/asm/paravirt.h | 5 -----
|
||||
arch/x86/include/asm/paravirt_types.h | 2 --
|
||||
arch/x86/kernel/kvm.c | 1 -
|
||||
arch/x86/kernel/paravirt.c | 1 -
|
||||
arch/x86/mm/pgtable.c | 16 ++++------------
|
||||
arch/x86/xen/mmu_pv.c | 1 -
|
||||
7 files changed, 4 insertions(+), 23 deletions(-)
|
||||
|
||||
--- a/arch/x86/hyperv/mmu.c
|
||||
+++ b/arch/x86/hyperv/mmu.c
|
||||
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
|
||||
|
||||
pr_info("Using hypercall for remote TLB flush\n");
|
||||
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
}
|
||||
--- a/arch/x86/include/asm/paravirt.h
|
||||
+++ b/arch/x86/include/asm/paravirt.h
|
||||
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
|
||||
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
|
||||
}
|
||||
|
||||
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
|
||||
-}
|
||||
-
|
||||
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
PVOP_VCALL1(mmu.exit_mmap, mm);
|
||||
--- a/arch/x86/include/asm/paravirt_types.h
|
||||
+++ b/arch/x86/include/asm/paravirt_types.h
|
||||
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
|
||||
void (*flush_tlb_multi)(const struct cpumask *cpus,
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
|
||||
-
|
||||
/* Hook for intercepting the destruction of an mm_struct. */
|
||||
void (*exit_mmap)(struct mm_struct *mm);
|
||||
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
|
||||
--- a/arch/x86/kernel/kvm.c
|
||||
+++ b/arch/x86/kernel/kvm.c
|
||||
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
|
||||
#ifdef CONFIG_SMP
|
||||
if (pv_tlb_flush_supported()) {
|
||||
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
pr_info("KVM setup pv remote TLB flush\n");
|
||||
}
|
||||
|
||||
--- a/arch/x86/kernel/paravirt.c
|
||||
+++ b/arch/x86/kernel/paravirt.c
|
||||
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
|
||||
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
||||
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
||||
.mmu.flush_tlb_multi = native_flush_tlb_multi,
|
||||
- .mmu.tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.mmu.exit_mmap = paravirt_nop,
|
||||
.mmu.notify_page_enc_status_changed = paravirt_nop,
|
||||
--- a/arch/x86/mm/pgtable.c
|
||||
+++ b/arch/x86/mm/pgtable.c
|
||||
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
|
||||
#define PGTABLE_HIGHMEM 0
|
||||
#endif
|
||||
|
||||
-#ifndef CONFIG_PARAVIRT
|
||||
-static inline
|
||||
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- tlb_remove_page(tlb, table);
|
||||
-}
|
||||
-#endif
|
||||
-
|
||||
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
|
||||
|
||||
pgtable_t pte_alloc_one(struct mm_struct *mm)
|
||||
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
|
||||
{
|
||||
pagetable_pte_dtor(page_ptdesc(pte));
|
||||
paravirt_release_pte(page_to_pfn(pte));
|
||||
- paravirt_tlb_remove_table(tlb, pte);
|
||||
+ tlb_remove_table(tlb, pte);
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 2
|
||||
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
|
||||
tlb->need_flush_all = 1;
|
||||
#endif
|
||||
pagetable_pmd_dtor(ptdesc);
|
||||
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
|
||||
+ tlb_remove_table(tlb, ptdesc_page(ptdesc));
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 3
|
||||
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
|
||||
|
||||
pagetable_pud_dtor(ptdesc);
|
||||
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
|
||||
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
|
||||
+ tlb_remove_table(tlb, virt_to_page(pud));
|
||||
}
|
||||
|
||||
#if CONFIG_PGTABLE_LEVELS > 4
|
||||
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
|
||||
{
|
||||
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
|
||||
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
|
||||
+ tlb_remove_table(tlb, virt_to_page(p4d));
|
||||
}
|
||||
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
|
||||
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
|
||||
--- a/arch/x86/xen/mmu_pv.c
|
||||
+++ b/arch/x86/xen/mmu_pv.c
|
||||
@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops
|
||||
.flush_tlb_kernel = xen_flush_tlb,
|
||||
.flush_tlb_one_user = xen_flush_tlb_one_user,
|
||||
.flush_tlb_multi = xen_flush_tlb_multi,
|
||||
- .tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.pgd_alloc = xen_pgd_alloc,
|
||||
.pgd_free = xen_pgd_free,
|
@ -1,23 +0,0 @@
|
||||
From efde57842082e36ab2e2be5a11c7b06ff9e18b3d Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:04 -0500
|
||||
Subject: x86/mm: add X86_FEATURE_INVLPGB definition.
|
||||
|
||||
Add the INVLPGB CPUID definition, allowing the kernel to recognize
|
||||
whether the CPU supports the INVLPGB instruction.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/cpufeatures.h | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/arch/x86/include/asm/cpufeatures.h
|
||||
+++ b/arch/x86/include/asm/cpufeatures.h
|
||||
@@ -335,6 +335,7 @@
|
||||
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
|
||||
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
|
||||
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
|
||||
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* "invlpgb" INVLPGB instruction */
|
||||
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
|
||||
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
|
||||
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
|
@ -1,57 +0,0 @@
|
||||
From 98953e10e342ceea1dc877cfb63318fa85879a59 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:05 -0500
|
||||
Subject: x86/mm: get INVLPGB count max from CPUID
|
||||
|
||||
The CPU advertises the maximum number of pages that can be shot down
|
||||
with one INVLPGB instruction in the CPUID data.
|
||||
|
||||
Save that information for later use.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/tlbflush.h | 1 +
|
||||
arch/x86/kernel/cpu/amd.c | 8 ++++++++
|
||||
arch/x86/kernel/setup.c | 4 ++++
|
||||
3 files changed, 13 insertions(+)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -182,6 +182,7 @@ static inline void cr4_init_shadow(void)
|
||||
|
||||
extern unsigned long mmu_cr4_features;
|
||||
extern u32 *trampoline_cr4_features;
|
||||
+extern u16 invlpgb_count_max;
|
||||
|
||||
extern void initialize_tlbstate_and_flush(void);
|
||||
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -1135,6 +1135,14 @@ static void cpu_detect_tlb_amd(struct cp
|
||||
tlb_lli_2m[ENTRIES] = eax & mask;
|
||||
|
||||
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
|
||||
+
|
||||
+ if (c->extended_cpuid_level < 0x80000008)
|
||||
+ return;
|
||||
+
|
||||
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
|
||||
+
|
||||
+ /* Max number of pages INVLPGB can invalidate in one shot */
|
||||
+ invlpgb_count_max = (edx & 0xffff) + 1;
|
||||
}
|
||||
|
||||
static const struct cpu_dev amd_cpu_dev = {
|
||||
--- a/arch/x86/kernel/setup.c
|
||||
+++ b/arch/x86/kernel/setup.c
|
||||
@@ -138,6 +138,10 @@ __visible unsigned long mmu_cr4_features
|
||||
__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+u16 invlpgb_count_max __ro_after_init;
|
||||
+#endif
|
||||
+
|
||||
#ifdef CONFIG_IMA
|
||||
static phys_addr_t ima_kexec_buffer_phys;
|
||||
static size_t ima_kexec_buffer_size;
|
@ -1,121 +0,0 @@
|
||||
From bc9d1fa1bd32dca78f38bd2a8557e7fc638308bd Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:06 -0500
|
||||
Subject: x86/mm: add INVLPGB support code
|
||||
|
||||
Add invlpgb.h with the helper functions and definitions needed to use
|
||||
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/invlpgb.h | 93 +++++++++++++++++++++++++++++++++
|
||||
arch/x86/include/asm/tlbflush.h | 1 +
|
||||
2 files changed, 94 insertions(+)
|
||||
create mode 100644 arch/x86/include/asm/invlpgb.h
|
||||
|
||||
--- /dev/null
|
||||
+++ b/arch/x86/include/asm/invlpgb.h
|
||||
@@ -0,0 +1,93 @@
|
||||
+/* SPDX-License-Identifier: GPL-2.0 */
|
||||
+#ifndef _ASM_X86_INVLPGB
|
||||
+#define _ASM_X86_INVLPGB
|
||||
+
|
||||
+#include <vdso/bits.h>
|
||||
+
|
||||
+/*
|
||||
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
|
||||
+ *
|
||||
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
|
||||
+ * be done in a parallel fashion.
|
||||
+ *
|
||||
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
|
||||
+ * this CPU have completed.
|
||||
+ */
|
||||
+static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr,
|
||||
+ int extra_count, bool pmd_stride, unsigned long flags)
|
||||
+{
|
||||
+ u64 rax = addr | flags;
|
||||
+ u32 ecx = (pmd_stride << 31) | extra_count;
|
||||
+ u32 edx = (pcid << 16) | asid;
|
||||
+
|
||||
+ asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx));
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
+ * of the three. For example:
|
||||
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
|
||||
+ *
|
||||
+ * The first can be used to invalidate (kernel) mappings at a particular
|
||||
+ * address across all processes.
|
||||
+ *
|
||||
+ * The latter invalidates all TLB entries matching a PCID.
|
||||
+ */
|
||||
+#define INVLPGB_VA BIT(0)
|
||||
+#define INVLPGB_PCID BIT(1)
|
||||
+#define INVLPGB_ASID BIT(2)
|
||||
+#define INVLPGB_INCLUDE_GLOBAL BIT(3)
|
||||
+#define INVLPGB_FINAL_ONLY BIT(4)
|
||||
+#define INVLPGB_INCLUDE_NESTED BIT(5)
|
||||
+
|
||||
+/* Flush all mappings for a given pcid and addr, not including globals. */
|
||||
+static inline void invlpgb_flush_user(unsigned long pcid,
|
||||
+ unsigned long addr)
|
||||
+{
|
||||
+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
|
||||
+}
|
||||
+
|
||||
+static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr,
|
||||
+ int nr, bool pmd_stride)
|
||||
+{
|
||||
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for a given ASID, not including globals. */
|
||||
+static inline void invlpgb_flush_single_asid(unsigned long asid)
|
||||
+{
|
||||
+ __invlpgb(asid, 0, 0, 0, 0, INVLPGB_ASID);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for a given PCID, not including globals. */
|
||||
+static inline void invlpgb_flush_single_pcid(unsigned long pcid)
|
||||
+{
|
||||
+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_all(void)
|
||||
+{
|
||||
+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
|
||||
+}
|
||||
+
|
||||
+/* Flush addr, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_addr(unsigned long addr, int nr)
|
||||
+{
|
||||
+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for all PCIDs except globals. */
|
||||
+static inline void invlpgb_flush_all_nonglobals(void)
|
||||
+{
|
||||
+ __invlpgb(0, 0, 0, 0, 0, 0);
|
||||
+}
|
||||
+
|
||||
+/* Wait for INVLPGB originated by this CPU to complete. */
|
||||
+static inline void tlbsync(void)
|
||||
+{
|
||||
+ asm volatile("tlbsync");
|
||||
+}
|
||||
+
|
||||
+#endif /* _ASM_X86_INVLPGB */
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/special_insns.h>
|
||||
#include <asm/smp.h>
|
||||
+#include <asm/invlpgb.h>
|
||||
#include <asm/invpcid.h>
|
||||
#include <asm/pti.h>
|
||||
#include <asm/processor-flags.h>
|
@ -1,61 +0,0 @@
|
||||
From ffd834c7140dc5fcaf96161c6d8c4601bb700afe Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:07 -0500
|
||||
Subject: x86/mm: use INVLPGB for kernel TLB flushes
|
||||
|
||||
Use broadcast TLB invalidation for kernel addresses when available.
|
||||
|
||||
This stops us from having to send IPIs for kernel TLB flushes.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/mm/tlb.c | 31 +++++++++++++++++++++++++++++++
|
||||
1 file changed, 31 insertions(+)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1048,6 +1048,32 @@ void flush_tlb_all(void)
|
||||
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
}
|
||||
|
||||
+static void broadcast_kernel_range_flush(unsigned long start, unsigned long end)
|
||||
+{
|
||||
+ unsigned long addr;
|
||||
+ unsigned long maxnr = invlpgb_count_max;
|
||||
+ unsigned long threshold = tlb_single_page_flush_ceiling * maxnr;
|
||||
+
|
||||
+ /*
|
||||
+ * TLBSYNC only waits for flushes originating on the same CPU.
|
||||
+ * Disabling migration allows us to wait on all flushes.
|
||||
+ */
|
||||
+ guard(preempt)();
|
||||
+
|
||||
+ if (end == TLB_FLUSH_ALL ||
|
||||
+ (end - start) > threshold << PAGE_SHIFT) {
|
||||
+ invlpgb_flush_all();
|
||||
+ } else {
|
||||
+ unsigned long nr;
|
||||
+ for (addr = start; addr < end; addr += nr << PAGE_SHIFT) {
|
||||
+ nr = min((end - addr) >> PAGE_SHIFT, maxnr);
|
||||
+ invlpgb_flush_addr(addr, nr);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ tlbsync();
|
||||
+}
|
||||
+
|
||||
static void do_kernel_range_flush(void *info)
|
||||
{
|
||||
struct flush_tlb_info *f = info;
|
||||
@@ -1060,6 +1086,11 @@ static void do_kernel_range_flush(void *
|
||||
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ broadcast_kernel_range_flush(start, end);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
/* Balance as user space task's flush, a bit conservative */
|
||||
if (end == TLB_FLUSH_ALL ||
|
||||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
@ -1,28 +0,0 @@
|
||||
From 13fac8226036456c15c517c1dd77be5109a61da2 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:08 -0500
|
||||
Subject: x86/tlb: use INVLPGB in flush_tlb_all
|
||||
|
||||
The flush_tlb_all() function is not used a whole lot, but we might
|
||||
as well use broadcast TLB flushing there, too.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/mm/tlb.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1045,6 +1045,12 @@ static void do_flush_tlb_all(void *info)
|
||||
void flush_tlb_all(void)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ guard(preempt)();
|
||||
+ invlpgb_flush_all();
|
||||
+ tlbsync();
|
||||
+ return;
|
||||
+ }
|
||||
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
}
|
||||
|
@ -1,36 +0,0 @@
|
||||
From 765d531296765e7fb2888c70cb56c0e25b459231 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:09 -0500
|
||||
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing
|
||||
|
||||
In the page reclaim code, we only track the CPU(s) where the TLB needs
|
||||
to be flushed, rather than all the individual mappings that may be getting
|
||||
invalidated.
|
||||
|
||||
Use broadcast TLB flushing when that is available.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/mm/tlb.c | 10 +++++++++-
|
||||
1 file changed, 9 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1281,8 +1281,16 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all);
|
||||
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
+ int cpu;
|
||||
|
||||
- int cpu = get_cpu();
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ guard(preempt)();
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+ tlbsync();
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ cpu = get_cpu();
|
||||
|
||||
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
|
||||
TLB_GENERATION_INVALID);
|
@ -1,508 +0,0 @@
|
||||
From 8b23125a3200a330fb407133f33aeb9ad3232603 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:10 -0500
|
||||
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
|
||||
processes
|
||||
|
||||
Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
|
||||
and newer CPUs.
|
||||
|
||||
In order to not exhaust PCID space, and keep TLB flushes local for single
|
||||
threaded processes, we only hand out broadcast ASIDs to processes active on
|
||||
3 or more CPUs, and gradually increase the threshold as broadcast ASID space
|
||||
is depleted.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/mmu.h | 6 +
|
||||
arch/x86/include/asm/mmu_context.h | 12 ++
|
||||
arch/x86/include/asm/tlbflush.h | 17 ++
|
||||
arch/x86/mm/tlb.c | 310 ++++++++++++++++++++++++++++-
|
||||
4 files changed, 336 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/mmu.h
|
||||
+++ b/arch/x86/include/asm/mmu.h
|
||||
@@ -46,6 +46,12 @@ typedef struct {
|
||||
unsigned long flags;
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+ struct list_head broadcast_asid_list;
|
||||
+ u16 broadcast_asid;
|
||||
+ bool asid_transition;
|
||||
+#endif
|
||||
+
|
||||
#ifdef CONFIG_ADDRESS_MASKING
|
||||
/* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
|
||||
unsigned long lam_cr3_mask;
|
||||
--- a/arch/x86/include/asm/mmu_context.h
|
||||
+++ b/arch/x86/include/asm/mmu_context.h
|
||||
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
+extern void destroy_context_free_broadcast_asid(struct mm_struct *mm);
|
||||
+
|
||||
/*
|
||||
* Init a new mm. Used on mm copies, like at fork()
|
||||
* and on mm's that are brand-new, like at execve().
|
||||
@@ -160,6 +162,13 @@ static inline int init_new_context(struc
|
||||
mm->context.execute_only_pkey = -1;
|
||||
}
|
||||
#endif
|
||||
+
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+ INIT_LIST_HEAD(&mm->context.broadcast_asid_list);
|
||||
+ mm->context.broadcast_asid = 0;
|
||||
+ mm->context.asid_transition = false;
|
||||
+#endif
|
||||
+
|
||||
mm_reset_untag_mask(mm);
|
||||
init_new_context_ldt(mm);
|
||||
return 0;
|
||||
@@ -169,6 +178,9 @@ static inline int init_new_context(struc
|
||||
static inline void destroy_context(struct mm_struct *mm)
|
||||
{
|
||||
destroy_context_ldt(mm);
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+ destroy_context_free_broadcast_asid(mm);
|
||||
+#endif
|
||||
}
|
||||
|
||||
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -65,6 +65,23 @@ static inline void cr4_clear_bits(unsign
|
||||
*/
|
||||
#define TLB_NR_DYN_ASIDS 6
|
||||
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+#define is_dyn_asid(asid) (asid) < TLB_NR_DYN_ASIDS
|
||||
+#define is_broadcast_asid(asid) (asid) >= TLB_NR_DYN_ASIDS
|
||||
+#define in_asid_transition(info) (info->mm && info->mm->context.asid_transition)
|
||||
+#define mm_broadcast_asid(mm) (mm->context.broadcast_asid)
|
||||
+#else
|
||||
+#define is_dyn_asid(asid) true
|
||||
+#define is_broadcast_asid(asid) false
|
||||
+#define in_asid_transition(info) false
|
||||
+#define mm_broadcast_asid(mm) 0
|
||||
+
|
||||
+inline bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
struct tlb_context {
|
||||
u64 ctx_id;
|
||||
u64 tlb_gen;
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -74,13 +74,15 @@
|
||||
* use different names for each of them:
|
||||
*
|
||||
* ASID - [0, TLB_NR_DYN_ASIDS-1]
|
||||
- * the canonical identifier for an mm
|
||||
+ * the canonical identifier for an mm, dynamically allocated on each CPU
|
||||
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
|
||||
+ * the canonical, global identifier for an mm, identical across all CPUs
|
||||
*
|
||||
- * kPCID - [1, TLB_NR_DYN_ASIDS]
|
||||
+ * kPCID - [1, MAX_ASID_AVAILABLE]
|
||||
* the value we write into the PCID part of CR3; corresponds to the
|
||||
* ASID+1, because PCID 0 is special.
|
||||
*
|
||||
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
|
||||
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
|
||||
* for KPTI each mm has two address spaces and thus needs two
|
||||
* PCID values, but we can still do with a single ASID denomination
|
||||
* for each mm. Corresponds to kPCID + 2048.
|
||||
@@ -225,6 +227,18 @@ static void choose_new_asid(struct mm_st
|
||||
return;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * TLB consistency for this ASID is maintained with INVLPGB;
|
||||
+ * TLB flushes happen even while the process isn't running.
|
||||
+ */
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(next)) {
|
||||
+ *new_asid = mm_broadcast_asid(next);
|
||||
+ *need_flush = false;
|
||||
+ return;
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
if (this_cpu_read(cpu_tlbstate.invalidate_other))
|
||||
clear_asid_other();
|
||||
|
||||
@@ -251,6 +265,245 @@ static void choose_new_asid(struct mm_st
|
||||
*need_flush = true;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_CPU_SUP_AMD
|
||||
+/*
|
||||
+ * Logic for AMD INVLPGB support.
|
||||
+ */
|
||||
+static DEFINE_RAW_SPINLOCK(broadcast_asid_lock);
|
||||
+static u16 last_broadcast_asid = TLB_NR_DYN_ASIDS;
|
||||
+static DECLARE_BITMAP(broadcast_asid_used, MAX_ASID_AVAILABLE) = { 0 };
|
||||
+static LIST_HEAD(broadcast_asid_list);
|
||||
+static int broadcast_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
|
||||
+
|
||||
+static void reset_broadcast_asid_space(void)
|
||||
+{
|
||||
+ mm_context_t *context;
|
||||
+
|
||||
+ lockdep_assert_held(&broadcast_asid_lock);
|
||||
+
|
||||
+ /*
|
||||
+ * Flush once when we wrap around the ASID space, so we won't need
|
||||
+ * to flush every time we allocate an ASID for broadcast flushing.
|
||||
+ */
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+ tlbsync();
|
||||
+
|
||||
+ /*
|
||||
+ * Leave the currently used broadcast ASIDs set in the bitmap, since
|
||||
+ * those cannot be reused before the next wraparound and flush.
|
||||
+ */
|
||||
+ bitmap_clear(broadcast_asid_used, 0, MAX_ASID_AVAILABLE);
|
||||
+ list_for_each_entry(context, &broadcast_asid_list, broadcast_asid_list)
|
||||
+ __set_bit(context->broadcast_asid, broadcast_asid_used);
|
||||
+
|
||||
+ last_broadcast_asid = TLB_NR_DYN_ASIDS;
|
||||
+}
|
||||
+
|
||||
+static u16 get_broadcast_asid(void)
|
||||
+{
|
||||
+ lockdep_assert_held(&broadcast_asid_lock);
|
||||
+
|
||||
+ do {
|
||||
+ u16 start = last_broadcast_asid;
|
||||
+ u16 asid = find_next_zero_bit(broadcast_asid_used, MAX_ASID_AVAILABLE, start);
|
||||
+
|
||||
+ if (asid >= MAX_ASID_AVAILABLE) {
|
||||
+ reset_broadcast_asid_space();
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ /* Try claiming this broadcast ASID. */
|
||||
+ if (!test_and_set_bit(asid, broadcast_asid_used)) {
|
||||
+ last_broadcast_asid = asid;
|
||||
+ return asid;
|
||||
+ }
|
||||
+ } while (1);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Returns true if the mm is transitioning from a CPU-local ASID to a broadcast
|
||||
+ * (INVLPGB) ASID, or the other way around.
|
||||
+ */
|
||||
+static bool needs_broadcast_asid_reload(struct mm_struct *next, u16 prev_asid)
|
||||
+{
|
||||
+ u16 broadcast_asid = mm_broadcast_asid(next);
|
||||
+
|
||||
+ if (broadcast_asid && prev_asid != broadcast_asid)
|
||||
+ return true;
|
||||
+
|
||||
+ if (!broadcast_asid && is_broadcast_asid(prev_asid))
|
||||
+ return true;
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+void destroy_context_free_broadcast_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!mm->context.broadcast_asid)
|
||||
+ return;
|
||||
+
|
||||
+ guard(raw_spinlock_irqsave)(&broadcast_asid_lock);
|
||||
+ mm->context.broadcast_asid = 0;
|
||||
+ list_del(&mm->context.broadcast_asid_list);
|
||||
+ broadcast_asid_available++;
|
||||
+}
|
||||
+
|
||||
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
|
||||
+{
|
||||
+ int count = 0;
|
||||
+ int cpu;
|
||||
+
|
||||
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
|
||||
+ return false;
|
||||
+
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ /* Skip the CPUs that aren't really running this process. */
|
||||
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
|
||||
+ continue;
|
||||
+
|
||||
+ if (++count > threshold)
|
||||
+ return true;
|
||||
+ }
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Assign a broadcast ASID to the current process, protecting against
|
||||
+ * races between multiple threads in the process.
|
||||
+ */
|
||||
+static void use_broadcast_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ guard(raw_spinlock_irqsave)(&broadcast_asid_lock);
|
||||
+
|
||||
+ /* This process is already using broadcast TLB invalidation. */
|
||||
+ if (mm->context.broadcast_asid)
|
||||
+ return;
|
||||
+
|
||||
+ mm->context.broadcast_asid = get_broadcast_asid();
|
||||
+ mm->context.asid_transition = true;
|
||||
+ list_add(&mm->context.broadcast_asid_list, &broadcast_asid_list);
|
||||
+ broadcast_asid_available--;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Figure out whether to assign a broadcast (global) ASID to a process.
|
||||
+ * We vary the threshold by how empty or full broadcast ASID space is.
|
||||
+ * 1/4 full: >= 4 active threads
|
||||
+ * 1/2 full: >= 8 active threads
|
||||
+ * 3/4 full: >= 16 active threads
|
||||
+ * 7/8 full: >= 32 active threads
|
||||
+ * etc
|
||||
+ *
|
||||
+ * This way we should never exhaust the broadcast ASID space, even on very
|
||||
+ * large systems, and the processes with the largest number of active
|
||||
+ * threads should be able to use broadcast TLB invalidation.
|
||||
+ */
|
||||
+#define HALFFULL_THRESHOLD 8
|
||||
+static bool meets_broadcast_asid_threshold(struct mm_struct *mm)
|
||||
+{
|
||||
+ int avail = broadcast_asid_available;
|
||||
+ int threshold = HALFFULL_THRESHOLD;
|
||||
+
|
||||
+ if (!avail)
|
||||
+ return false;
|
||||
+
|
||||
+ if (avail > MAX_ASID_AVAILABLE * 3 / 4) {
|
||||
+ threshold = HALFFULL_THRESHOLD / 4;
|
||||
+ } else if (avail > MAX_ASID_AVAILABLE / 2) {
|
||||
+ threshold = HALFFULL_THRESHOLD / 2;
|
||||
+ } else if (avail < MAX_ASID_AVAILABLE / 3) {
|
||||
+ do {
|
||||
+ avail *= 2;
|
||||
+ threshold *= 2;
|
||||
+ } while ((avail + threshold) < MAX_ASID_AVAILABLE / 2);
|
||||
+ }
|
||||
+
|
||||
+ return mm_active_cpus_exceeds(mm, threshold);
|
||||
+}
|
||||
+
|
||||
+static void count_tlb_flush(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!static_cpu_has(X86_FEATURE_INVLPGB))
|
||||
+ return;
|
||||
+
|
||||
+ /* Check every once in a while. */
|
||||
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
|
||||
+ return;
|
||||
+
|
||||
+ if (meets_broadcast_asid_threshold(mm))
|
||||
+ use_broadcast_asid(mm);
|
||||
+}
|
||||
+
|
||||
+static void finish_asid_transition(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ struct mm_struct *mm = info->mm;
|
||||
+ int bc_asid = mm_broadcast_asid(mm);
|
||||
+ int cpu;
|
||||
+
|
||||
+ if (!mm->context.asid_transition)
|
||||
+ return;
|
||||
+
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ /*
|
||||
+ * If at least one CPU is not using the broadcast ASID yet,
|
||||
+ * send a TLB flush IPI. The IPI should cause stragglers
|
||||
+ * to transition soon.
|
||||
+ */
|
||||
+ if (per_cpu(cpu_tlbstate.loaded_mm_asid, cpu) != bc_asid) {
|
||||
+ flush_tlb_multi(mm_cpumask(info->mm), info);
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* All the CPUs running this process are using the broadcast ASID. */
|
||||
+ mm->context.asid_transition = 0;
|
||||
+}
|
||||
+
|
||||
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ bool pmd = info->stride_shift == PMD_SHIFT;
|
||||
+ unsigned long maxnr = invlpgb_count_max;
|
||||
+ unsigned long asid = info->mm->context.broadcast_asid;
|
||||
+ unsigned long addr = info->start;
|
||||
+ unsigned long nr;
|
||||
+
|
||||
+ /* Flushing multiple pages at once is not supported with 1GB pages. */
|
||||
+ if (info->stride_shift > PMD_SHIFT)
|
||||
+ maxnr = 1;
|
||||
+
|
||||
+ if (info->end == TLB_FLUSH_ALL) {
|
||||
+ invlpgb_flush_single_pcid(kern_pcid(asid));
|
||||
+ /* Do any CPUs supporting INVLPGB need PTI? */
|
||||
+ if (static_cpu_has(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_single_pcid(user_pcid(asid));
|
||||
+ } else do {
|
||||
+ /*
|
||||
+ * Calculate how many pages can be flushed at once; if the
|
||||
+ * remainder of the range is less than one page, flush one.
|
||||
+ */
|
||||
+ nr = min(maxnr, (info->end - addr) >> info->stride_shift);
|
||||
+ nr = max(nr, 1);
|
||||
+
|
||||
+ invlpgb_flush_user_nr(kern_pcid(asid), addr, nr, pmd);
|
||||
+ /* Do any CPUs supporting INVLPGB need PTI? */
|
||||
+ if (static_cpu_has(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_user_nr(user_pcid(asid), addr, nr, pmd);
|
||||
+ addr += nr << info->stride_shift;
|
||||
+ } while (addr < info->end);
|
||||
+
|
||||
+ finish_asid_transition(info);
|
||||
+
|
||||
+ /* Wait for the INVLPGBs kicked off above to finish. */
|
||||
+ tlbsync();
|
||||
+}
|
||||
+#endif /* CONFIG_CPU_SUP_AMD */
|
||||
+
|
||||
/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
@@ -556,8 +809,9 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
*/
|
||||
if (prev == next) {
|
||||
/* Not actually switching mm's */
|
||||
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
- next->context.ctx_id);
|
||||
+ if (is_dyn_asid(prev_asid))
|
||||
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
+ next->context.ctx_id);
|
||||
|
||||
/*
|
||||
* If this races with another thread that enables lam, 'new_lam'
|
||||
@@ -574,6 +828,23 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
+ * Check if the current mm is transitioning to a new ASID.
|
||||
+ */
|
||||
+ if (needs_broadcast_asid_reload(next, prev_asid)) {
|
||||
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
+
|
||||
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
+ goto reload_tlb;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Broadcast TLB invalidation keeps this PCID up to date
|
||||
+ * all the time.
|
||||
+ */
|
||||
+ if (is_broadcast_asid(prev_asid))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
* If the CPU is not in lazy TLB mode, we are just switching
|
||||
* from one thread in a process to another thread in the same
|
||||
* process. No TLB flush required.
|
||||
@@ -629,8 +900,10 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
barrier();
|
||||
}
|
||||
|
||||
+reload_tlb:
|
||||
new_lam = mm_lam_cr3_mask(next);
|
||||
if (need_flush) {
|
||||
+ VM_BUG_ON(is_broadcast_asid(new_asid));
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
|
||||
@@ -749,7 +1022,7 @@ static void flush_tlb_func(void *info)
|
||||
const struct flush_tlb_info *f = info;
|
||||
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+ u64 local_tlb_gen;
|
||||
bool local = smp_processor_id() == f->initiating_cpu;
|
||||
unsigned long nr_invalidate = 0;
|
||||
u64 mm_tlb_gen;
|
||||
@@ -769,6 +1042,16 @@ static void flush_tlb_func(void *info)
|
||||
if (unlikely(loaded_mm == &init_mm))
|
||||
return;
|
||||
|
||||
+ /* Reload the ASID if transitioning into or out of a broadcast ASID */
|
||||
+ if (needs_broadcast_asid_reload(loaded_mm, loaded_mm_asid)) {
|
||||
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
|
||||
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
+ }
|
||||
+
|
||||
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
|
||||
+ if (is_broadcast_asid(loaded_mm_asid))
|
||||
+ return;
|
||||
+
|
||||
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
||||
loaded_mm->context.ctx_id);
|
||||
|
||||
@@ -786,6 +1069,8 @@ static void flush_tlb_func(void *info)
|
||||
return;
|
||||
}
|
||||
|
||||
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+
|
||||
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
|
||||
f->new_tlb_gen <= local_tlb_gen)) {
|
||||
/*
|
||||
@@ -926,7 +1211,7 @@ STATIC_NOPV void native_flush_tlb_multi(
|
||||
* up on the new contents of what used to be page tables, while
|
||||
* doing a speculative memory access.
|
||||
*/
|
||||
- if (info->freed_tables)
|
||||
+ if (info->freed_tables || in_asid_transition(info))
|
||||
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
|
||||
else
|
||||
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
|
||||
@@ -998,14 +1283,18 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
bool freed_tables)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
+ unsigned long threshold = tlb_single_page_flush_ceiling;
|
||||
u64 new_tlb_gen;
|
||||
int cpu;
|
||||
|
||||
+ if (static_cpu_has(X86_FEATURE_INVLPGB))
|
||||
+ threshold *= invlpgb_count_max;
|
||||
+
|
||||
cpu = get_cpu();
|
||||
|
||||
/* Should we flush just the requested range? */
|
||||
if ((end == TLB_FLUSH_ALL) ||
|
||||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
||||
+ ((end - start) >> stride_shift) > threshold) {
|
||||
start = 0;
|
||||
end = TLB_FLUSH_ALL;
|
||||
}
|
||||
@@ -1021,8 +1310,11 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD) && mm_broadcast_asid(mm)) {
|
||||
+ broadcast_tlb_flush(info);
|
||||
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
flush_tlb_multi(mm_cpumask(mm), info);
|
||||
+ count_tlb_flush(mm);
|
||||
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
||||
lockdep_assert_irqs_enabled();
|
||||
local_irq_disable();
|
@ -1,126 +0,0 @@
|
||||
From 1767a2786ebbe3451f973df44485309c2a8fd8a5 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:11 -0500
|
||||
Subject: x86,tlb: do targeted broadcast flushing from tlbbatch code
|
||||
|
||||
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
|
||||
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
|
||||
|
||||
This also allows us to avoid adding the CPUs of processes using broadcast
|
||||
flushing to the batch->cpumask, and will hopefully further reduce TLB
|
||||
flushing from the reclaim and compaction paths.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/tlbbatch.h | 1 +
|
||||
arch/x86/include/asm/tlbflush.h | 12 +++------
|
||||
arch/x86/mm/tlb.c | 48 ++++++++++++++++++++++++++-------
|
||||
3 files changed, 42 insertions(+), 19 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbbatch.h
|
||||
+++ b/arch/x86/include/asm/tlbbatch.h
|
||||
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
|
||||
* the PFNs being flushed..
|
||||
*/
|
||||
struct cpumask cpumask;
|
||||
+ bool used_invlpgb;
|
||||
};
|
||||
|
||||
#endif /* _ARCH_X86_TLBBATCH_H */
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -296,21 +296,15 @@ static inline u64 inc_mm_tlb_gen(struct
|
||||
return atomic64_inc_return(&mm->context.tlb_gen);
|
||||
}
|
||||
|
||||
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
|
||||
- struct mm_struct *mm,
|
||||
- unsigned long uaddr)
|
||||
-{
|
||||
- inc_mm_tlb_gen(mm);
|
||||
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
|
||||
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
|
||||
-}
|
||||
-
|
||||
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
|
||||
{
|
||||
flush_tlb_mm(mm);
|
||||
}
|
||||
|
||||
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
|
||||
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
|
||||
+ struct mm_struct *mm,
|
||||
+ unsigned long uaddr);
|
||||
|
||||
static inline bool pte_flags_need_flush(unsigned long oldflags,
|
||||
unsigned long newflags,
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1573,16 +1573,7 @@ EXPORT_SYMBOL_GPL(__flush_tlb_all);
|
||||
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
- int cpu;
|
||||
-
|
||||
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
- guard(preempt)();
|
||||
- invlpgb_flush_all_nonglobals();
|
||||
- tlbsync();
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
- cpu = get_cpu();
|
||||
+ int cpu = get_cpu();
|
||||
|
||||
info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
|
||||
TLB_GENERATION_INVALID);
|
||||
@@ -1600,12 +1591,49 @@ void arch_tlbbatch_flush(struct arch_tlb
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
|
||||
+ * The cpumask above contains only CPUs that were running tasks
|
||||
+ * not using broadcast TLB flushing.
|
||||
+ */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
|
||||
+ tlbsync();
|
||||
+ migrate_enable();
|
||||
+ batch->used_invlpgb = false;
|
||||
+ }
|
||||
+
|
||||
cpumask_clear(&batch->cpumask);
|
||||
|
||||
put_flush_tlb_info();
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
|
||||
+ struct mm_struct *mm,
|
||||
+ unsigned long uaddr)
|
||||
+{
|
||||
+ if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_broadcast_asid(mm)) {
|
||||
+ u16 asid = mm_broadcast_asid(mm);
|
||||
+ /*
|
||||
+ * Queue up an asynchronous invalidation. The corresponding
|
||||
+ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
|
||||
+ * on the same CPU.
|
||||
+ */
|
||||
+ if (!batch->used_invlpgb) {
|
||||
+ batch->used_invlpgb = true;
|
||||
+ migrate_disable();
|
||||
+ }
|
||||
+ invlpgb_flush_user_nr(kern_pcid(asid), uaddr, 1, 0);
|
||||
+ /* Do any CPUs supporting INVLPGB need PTI? */
|
||||
+ if (static_cpu_has(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_user_nr(user_pcid(asid), uaddr, 1, 0);
|
||||
+ } else {
|
||||
+ inc_mm_tlb_gen(mm);
|
||||
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
|
||||
+ }
|
||||
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Blindly accessing user memory from NMI context can be dangerous
|
||||
* if we're in the middle of switching the current user task or
|
@ -1,82 +0,0 @@
|
||||
From 13faf551d1a146ed18c448babe1953def4ed3d56 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:12 -0500
|
||||
Subject: x86/mm: enable AMD translation cache extensions
|
||||
|
||||
With AMD TCE (translation cache extensions) only the intermediate mappings
|
||||
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
|
||||
rather than all intermediate mappings getting zapped at every TLB invalidation.
|
||||
|
||||
This can help reduce the TLB miss rate, by keeping more intermediate
|
||||
mappings in the cache.
|
||||
|
||||
From the AMD manual:
|
||||
|
||||
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
|
||||
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
|
||||
TLB entries. When this bit is 0, these instructions remove the target PTE
|
||||
from the TLB as well as all upper-level table entries that are cached
|
||||
in the TLB, whether or not they are associated with the target PTE.
|
||||
When this bit is set, these instructions will remove the target PTE and
|
||||
only those upper-level entries that lead to the target PTE in
|
||||
the page table hierarchy, leaving unrelated upper-level entries intact.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/kernel/cpu/amd.c | 8 ++++++++
|
||||
arch/x86/mm/tlb.c | 10 +++++++---
|
||||
2 files changed, 15 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -1143,6 +1143,14 @@ static void cpu_detect_tlb_amd(struct cp
|
||||
|
||||
/* Max number of pages INVLPGB can invalidate in one shot */
|
||||
invlpgb_count_max = (edx & 0xffff) + 1;
|
||||
+
|
||||
+ /* If supported, enable translation cache extensions (TCE) */
|
||||
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
+ if (ecx & BIT(17)) {
|
||||
+ u64 msr = native_read_msr(MSR_EFER);;
|
||||
+ msr |= BIT(15);
|
||||
+ wrmsrl(MSR_EFER, msr);
|
||||
+ }
|
||||
}
|
||||
|
||||
static const struct cpu_dev amd_cpu_dev = {
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -477,7 +477,7 @@ static void broadcast_tlb_flush(struct f
|
||||
if (info->stride_shift > PMD_SHIFT)
|
||||
maxnr = 1;
|
||||
|
||||
- if (info->end == TLB_FLUSH_ALL) {
|
||||
+ if (info->end == TLB_FLUSH_ALL || info->freed_tables) {
|
||||
invlpgb_flush_single_pcid(kern_pcid(asid));
|
||||
/* Do any CPUs supporting INVLPGB need PTI? */
|
||||
if (static_cpu_has(X86_FEATURE_PTI))
|
||||
@@ -1110,7 +1110,7 @@ static void flush_tlb_func(void *info)
|
||||
*
|
||||
* The only question is whether to do a full or partial flush.
|
||||
*
|
||||
- * We do a partial flush if requested and two extra conditions
|
||||
+ * We do a partial flush if requested and three extra conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
|
||||
@@ -1137,10 +1137,14 @@ static void flush_tlb_func(void *info)
|
||||
* date. By doing a full flush instead, we can increase
|
||||
* local_tlb_gen all the way to mm_tlb_gen and we can probably
|
||||
* avoid another flush in the very near future.
|
||||
+ *
|
||||
+ * 3. No page tables were freed. If page tables were freed, a full
|
||||
+ * flush ensures intermediate translations in the TLB get flushed.
|
||||
*/
|
||||
if (f->end != TLB_FLUSH_ALL &&
|
||||
f->new_tlb_gen == local_tlb_gen + 1 &&
|
||||
- f->new_tlb_gen == mm_tlb_gen) {
|
||||
+ f->new_tlb_gen == mm_tlb_gen &&
|
||||
+ !f->freed_tables) {
|
||||
/* Partial flush */
|
||||
unsigned long addr = f->start;
|
||||
|
@ -1,28 +0,0 @@
|
||||
From 2fc0be5fbcee1a62162b699451bb94f90ec64244 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Mon, 30 Dec 2024 12:53:13 -0500
|
||||
Subject: x86/mm: only invalidate final translations with INVLPGB
|
||||
|
||||
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
|
||||
This way only leaf mappings get removed from the TLB, leaving intermediate
|
||||
translations cached.
|
||||
|
||||
On the (rare) occasions where we free page tables we do a full flush,
|
||||
ensuring intermediate translations get flushed from the TLB.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
---
|
||||
arch/x86/include/asm/invlpgb.h | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/invlpgb.h
|
||||
+++ b/arch/x86/include/asm/invlpgb.h
|
||||
@@ -51,7 +51,7 @@ static inline void invlpgb_flush_user(un
|
||||
static inline void invlpgb_flush_user_nr(unsigned long pcid, unsigned long addr,
|
||||
int nr, bool pmd_stride)
|
||||
{
|
||||
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
|
||||
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA | INVLPGB_FINAL_ONLY);
|
||||
}
|
||||
|
||||
/* Flush all mappings for a given ASID, not including globals. */
|
@ -1,92 +0,0 @@
|
||||
From a3ff46a157cadb29349c5b388fc70804c351e561 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Thu, 19 Dec 2024 15:32:53 -0500
|
||||
Subject: mm: remove unnecessary calls to lru_add_drain
|
||||
|
||||
There seem to be several categories of calls to lru_add_drain
|
||||
and lru_add_drain_all.
|
||||
|
||||
The first are code paths that recently allocated, swapped in,
|
||||
or otherwise processed a batch of pages, and want them all on
|
||||
the LRU. These drain pages that were recently allocated,
|
||||
probably on the local CPU.
|
||||
|
||||
A second category are code paths that are actively trying to
|
||||
reclaim, migrate, or offline memory. These often use lru_add_drain_all,
|
||||
to drain the caches on all CPUs.
|
||||
|
||||
However, there also seem to be some other callers where we
|
||||
aren't really doing either. They are calling lru_add_drain(),
|
||||
despite operating on pages that may have been allocated
|
||||
long ago, and quite possibly on different CPUs.
|
||||
|
||||
Those calls are not likely to be effective at anything but
|
||||
creating lock contention on the LRU locks.
|
||||
|
||||
Remove the lru_add_drain calls in the latter category.
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Suggested-by: David Hildenbrand <david@redhat.com>
|
||||
---
|
||||
mm/memory.c | 1 -
|
||||
mm/mmap.c | 2 --
|
||||
mm/swap_state.c | 1 -
|
||||
mm/vma.c | 2 --
|
||||
4 files changed, 6 deletions(-)
|
||||
|
||||
--- a/mm/memory.c
|
||||
+++ b/mm/memory.c
|
||||
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
|
||||
struct mmu_notifier_range range;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
- lru_add_drain();
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
|
||||
address, end);
|
||||
hugetlb_zap_begin(vma, &range.start, &range.end);
|
||||
--- a/mm/mmap.c
|
||||
+++ b/mm/mmap.c
|
||||
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
|
||||
goto destroy;
|
||||
}
|
||||
|
||||
- lru_add_drain();
|
||||
flush_cache_mm(mm);
|
||||
tlb_gather_mmu_fullmm(&tlb, mm);
|
||||
/* update_hiwater_rss(mm) here? but nobody should be looking */
|
||||
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
|
||||
vma, new_start, length, false, true))
|
||||
return -ENOMEM;
|
||||
|
||||
- lru_add_drain();
|
||||
tlb_gather_mmu(&tlb, mm);
|
||||
next = vma_next(&vmi);
|
||||
if (new_end > old_start) {
|
||||
--- a/mm/swap_state.c
|
||||
+++ b/mm/swap_state.c
|
||||
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
|
||||
struct folio_batch folios;
|
||||
unsigned int refs[PAGEVEC_SIZE];
|
||||
|
||||
- lru_add_drain();
|
||||
folio_batch_init(&folios);
|
||||
for (int i = 0; i < nr; i++) {
|
||||
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
- lru_add_drain();
|
||||
tlb_gather_mmu(&tlb, mm);
|
||||
update_hiwater_rss(mm);
|
||||
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
|
||||
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
|
||||
* were isolated before we downgraded mmap_lock.
|
||||
*/
|
||||
mas_set(mas_detach, 1);
|
||||
- lru_add_drain();
|
||||
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
|
||||
update_hiwater_rss(vms->vma->vm_mm);
|
||||
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
|
15
debian/patches/series
vendored
15
debian/patches/series
vendored
@ -173,20 +173,6 @@ patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.pat
|
||||
patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch
|
||||
patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch
|
||||
|
||||
patchset-pf/invlpgb/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
|
||||
patchset-pf/invlpgb/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch
|
||||
patchset-pf/invlpgb/0003-x86-mm-add-X86_FEATURE_INVLPGB-definition.patch
|
||||
patchset-pf/invlpgb/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch
|
||||
patchset-pf/invlpgb/0005-x86-mm-add-INVLPGB-support-code.patch
|
||||
patchset-pf/invlpgb/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch
|
||||
patchset-pf/invlpgb/0007-x86-tlb-use-INVLPGB-in-flush_tlb_all.patch
|
||||
patchset-pf/invlpgb/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch
|
||||
patchset-pf/invlpgb/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch
|
||||
patchset-pf/invlpgb/0010-x86-tlb-do-targeted-broadcast-flushing-from-tlbbatch.patch
|
||||
patchset-pf/invlpgb/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
|
||||
patchset-pf/invlpgb/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
|
||||
patchset-pf/invlpgb/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
|
||||
|
||||
patchset-pf/pksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch
|
||||
|
||||
patchset-pf/xfs/0001-xfs-fix-chown-with-rt-quota.patch
|
||||
@ -261,6 +247,5 @@ patchset-zen/sauce/0024-ZEN-kernel-Kconfig.preempt-Remove-EXPERT-conditional.pat
|
||||
|
||||
patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch
|
||||
patchset-pf/fixes/0002-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch
|
||||
patchset-pf/fixes/0003-USB-core-Disable-LPM-only-for-non-suspended-ports.patch
|
||||
|
||||
patchset-zen/fixes/0001-futex-improve-user-space-accesses.patch
|
||||
|
Loading…
x
Reference in New Issue
Block a user