release 6.12.11

This commit is contained in: parent 5b35bedab1, commit dbf57fda83
7 debian/changelog vendored
@@ -1,3 +1,10 @@
+linux (6.12.11-1) sid; urgency=medium
+
+  * New upstream stable update:
+    https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.11
+
+ -- Konstantin Demin <rockdrilla@gmail.com>  Tue, 28 Jan 2025 09:10:17 +0300
+
 linux (6.12.10-1) sid; urgency=medium
 
   * New upstream stable update:
2 debian/config/amd64/config.cloud vendored
@@ -2491,8 +2491,6 @@ CONFIG_KEXEC_CORE=y
 CONFIG_LZ4HC_COMPRESS=m
 CONFIG_LZ4_COMPRESS=m
 CONFIG_MFD_CORE=m
-CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
-CONFIG_MMU_GATHER_TABLE_FREE=y
 CONFIG_ND_BTT=m
 CONFIG_ND_PFN=m
 CONFIG_NETFS_SUPPORT=m
2 debian/config/amd64/config.vm vendored
@@ -4064,8 +4064,6 @@ CONFIG_LZ4_COMPRESS=m
 CONFIG_MAPPING_DIRTY_HELPERS=y
 CONFIG_MCTP_FLOWS=y
 CONFIG_MFD_CORE=m
-CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
-CONFIG_MMU_GATHER_TABLE_FREE=y
 CONFIG_MOUSE_PS2_SMBUS=y
 CONFIG_ND_BTT=m
 CONFIG_ND_PFN=m
3 debian/config/config vendored
@@ -3945,6 +3945,8 @@ CONFIG_MLX4_CORE=m
 CONFIG_MMCONF_FAM10H=y
 CONFIG_MMU=y
 CONFIG_MMU_GATHER_MERGE_VMAS=y
+CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
+CONFIG_MMU_GATHER_TABLE_FREE=y
 CONFIG_MMU_LAZY_TLB_REFCOUNT=y
 CONFIG_MMU_NOTIFIER=y
 CONFIG_MODULES_TREE_LOOKUP=y
@@ -4186,6 +4188,7 @@ CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m
 CONFIG_X86=y
 CONFIG_X86_64=y
 CONFIG_X86_64_SMP=y
+CONFIG_X86_BROADCAST_TLB_FLUSH=y
 CONFIG_X86_CMOV=y
 CONFIG_X86_CMPXCHG64=y
 CONFIG_X86_DEBUGCTLMSR=y
@@ -1,27 +0,0 @@
From: Ben Hutchings <ben@decadent.org.uk>
Subject: cdc_ncm,cdc_mbim: Use NCM by default
Date: Sun, 31 Mar 2013 03:58:04 +0100
Forwarded: not-needed

Devices that support both NCM and MBIM modes should be kept in NCM
mode unless there is userland support for MBIM.

Set the default value of cdc_ncm.prefer_mbim to false and leave it to
userland (modem-manager) to override this with a modprobe.conf file
once it's ready to speak MBIM.

---
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -54,11 +54,7 @@
 #include <linux/usb/cdc.h>
 #include <linux/usb/cdc_ncm.h>
 
-#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM)
-static bool prefer_mbim = true;
-#else
 static bool prefer_mbim;
-#endif
 module_param(prefer_mbim, bool, 0644);
 MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions");
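The userland override this patch description alludes to is a single modprobe option. A minimal sketch of such a configuration file (the path and file name are illustrative; any *.conf under /etc/modprobe.d/ is read by modprobe):

# /etc/modprobe.d/mbim.conf: opt dual NCM/MBIM devices back in to MBIM
# once userspace (e.g. ModemManager) is ready to speak it.
options cdc_ncm prefer_mbim=Y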
@@ -22,7 +22,7 @@ Export the currently un-exported symbols it depends on.
 
 --- a/fs/file.c
 +++ b/fs/file.c
-@@ -792,6 +792,7 @@ struct file *file_close_fd(unsigned int
+@@ -793,6 +793,7 @@ struct file *file_close_fd(unsigned int
 
     return file;
 }
@@ -47,7 +47,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com>
 	.reflect = menu_reflect,
 
 --- a/drivers/cpuidle/governors/teo.c
 +++ b/drivers/cpuidle/governors/teo.c
-@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui
+@@ -542,7 +542,7 @@ static int teo_enable_device(struct cpui
 
 static struct cpuidle_governor teo_governor = {
 	.name = "teo",
123 debian/patches/patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch vendored Normal file
@@ -0,0 +1,123 @@
From 6cb30d7518301094b9c7397a24a22cf538a1d64c Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:20 -0500
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional

Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
paravirt, and not when running on bare metal.

There is no real good reason to do things differently for
each setup. Make them all the same.

Currently get_user_pages_fast synchronizes against page table
freeing in two different ways:
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE

This is done because some paravirt TLB flush implementations
handle the TLB flush in the hypervisor, and will do the flush
even when the target CPU has interrupts disabled.

Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
Using RCU synchronization between page table freeing and get_user_pages_fast()
allows bare metal to also do TLB flushing while interrupts are disabled.

Various places in the mm do still block IRQs or disable preemption
as an implicit way to block RCU frees.

That makes it safe to use INVLPGB on AMD CPUs.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/Kconfig           |  2 +-
 arch/x86/kernel/paravirt.c |  7 +------
 arch/x86/mm/pgtable.c      | 16 ++++------------
 3 files changed, 6 insertions(+), 19 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,7 +270,7 @@ config X86
 	select HAVE_PCI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_MERGE_VMAS
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
 		static_branch_enable(&virt_spin_lock_key);
 }
 
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	tlb_remove_page(tlb, table);
-}
-
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
 
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
 	.mmu.flush_tlb_kernel = native_flush_tlb_global,
 	.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
 	.mmu.flush_tlb_multi = native_flush_tlb_multi,
-	.mmu.tlb_remove_table = native_tlb_remove_table,
+	.mmu.tlb_remove_table = tlb_remove_table,
 
 	.mmu.exit_mmap = paravirt_nop,
 	.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
 #define PGTABLE_HIGHMEM 0
 #endif
 
-#ifndef CONFIG_PARAVIRT
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	tlb_remove_page(tlb, table);
-}
-#endif
-
 gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
 
 pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
 {
 	pagetable_pte_dtor(page_ptdesc(pte));
 	paravirt_release_pte(page_to_pfn(pte));
-	paravirt_tlb_remove_table(tlb, pte);
+	tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
 	tlb->need_flush_all = 1;
 #endif
 	pagetable_pmd_dtor(ptdesc);
-	paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+	tlb_remove_table(tlb, ptdesc_page(ptdesc));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
 
 	pagetable_pud_dtor(ptdesc);
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+	tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-	paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+	tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif /* CONFIG_PGTABLE_LEVELS > 4 */
 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
84 debian/patches/patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch vendored Normal file
@@ -0,0 +1,84 @@
From df8f812b62c450b98b972ad0a4d5a5ba400f5eae Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:21 -0500
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call

Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.

Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 arch/x86/hyperv/mmu.c                 | 1 -
 arch/x86/include/asm/paravirt.h       | 5 -----
 arch/x86/include/asm/paravirt_types.h | 2 --
 arch/x86/kernel/kvm.c                 | 1 -
 arch/x86/kernel/paravirt.c            | 1 -
 arch/x86/xen/mmu_pv.c                 | 1 -
 6 files changed, 11 deletions(-)

--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
 
 	pr_info("Using hypercall for remote TLB flush\n");
 	pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
-	pv_ops.mmu.tlb_remove_table = tlb_remove_table;
 }
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
 	PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
 }
 
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
 static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
 {
 	PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
 	void (*flush_tlb_multi)(const struct cpumask *cpus,
 				const struct flush_tlb_info *info);
 
-	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
 	/* Hook for intercepting the destruction of an mm_struct. */
 	void (*exit_mmap)(struct mm_struct *mm);
 	void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
 #ifdef CONFIG_SMP
 	if (pv_tlb_flush_supported()) {
 		pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
-		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
 		pr_info("KVM setup pv remote TLB flush\n");
 	}
 
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
 	.mmu.flush_tlb_kernel = native_flush_tlb_global,
 	.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
 	.mmu.flush_tlb_multi = native_flush_tlb_multi,
-	.mmu.tlb_remove_table = tlb_remove_table,
 
 	.mmu.exit_mmap = paravirt_nop,
 	.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops
 	.flush_tlb_kernel = xen_flush_tlb,
 	.flush_tlb_one_user = xen_flush_tlb_one_user,
 	.flush_tlb_multi = xen_flush_tlb_multi,
-	.tlb_remove_table = tlb_remove_table,
 
 	.pgd_alloc = xen_pgd_alloc,
 	.pgd_free = xen_pgd_free,
@@ -0,0 +1,93 @@
From 8b2bd3f69b50cfe59eee4506413715878bcbb901 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:22 -0500
Subject: x86/mm: consolidate full flush threshold decision

Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
---
 arch/x86/mm/tlb.c | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -981,6 +981,15 @@ static struct flush_tlb_info *get_flush_
 	info->new_tlb_gen = new_tlb_gen;
 	info->initiating_cpu = smp_processor_id();
 
+	/*
+	 * If the number of flushes is so large that a full flush
+	 * would be faster, do a full flush.
+	 */
+	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+		info->start = 0;
+		info->end = TLB_FLUSH_ALL;
+	}
+
 	return info;
 }
 
@@ -998,17 +1007,8 @@ void flush_tlb_mm_range(struct mm_struct
 				bool freed_tables)
 {
 	struct flush_tlb_info *info;
+	int cpu = get_cpu();
 	u64 new_tlb_gen;
-	int cpu;
-
-	cpu = get_cpu();
-
-	/* Should we flush just the requested range? */
-	if ((end == TLB_FLUSH_ALL) ||
-	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
-		start = 0;
-		end = TLB_FLUSH_ALL;
-	}
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
 	new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1060,22 +1060,19 @@ static void do_kernel_range_flush(void *
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
-		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	} else {
-		struct flush_tlb_info *info;
+	struct flush_tlb_info *info;
 
-		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false,
-					  TLB_GENERATION_INVALID);
+	guard(preempt)();
+
+	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+				  TLB_GENERATION_INVALID);
 
+	if (info->end == TLB_FLUSH_ALL)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
-		put_flush_tlb_info();
-		preempt_enable();
-	}
+	put_flush_tlb_info();
 }
 
 /*
@@ -1247,7 +1244,7 @@ void arch_tlbbatch_flush(struct arch_tlb
 
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false,
 				  TLB_GENERATION_INVALID);
 	/*
	 * flush_tlb_multi() is not optimized for the common case in which only
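The consolidated check is easy to exercise outside the kernel. A minimal userspace C sketch of the same decision rule (names mirror the kernel's; 33 matches the kernel's default ceiling, but treat the numbers as illustrative):

#include <stdbool.h>
#include <stdio.h>

static unsigned long tlb_single_page_flush_ceiling = 33;

/* Mirror of the check get_flush_tlb_info() now applies: once the number
 * of per-page invalidations exceeds the ceiling, promote to a full flush. */
static bool wants_full_flush(unsigned long start, unsigned long end,
                             unsigned int stride_shift)
{
    return (end - start) >> stride_shift > tlb_single_page_flush_ceiling;
}

int main(void)
{
    /* 16 pages at 4 KiB stride (stride_shift 12): keep the ranged flush. */
    printf("%d\n", wants_full_flush(0, 16UL << 12, 12)); /* prints 0 */
    /* 64 pages: over the ceiling, so a full flush is cheaper. */
    printf("%d\n", wants_full_flush(0, 64UL << 12, 12)); /* prints 1 */
    return 0;
}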
89 debian/patches/patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch vendored Normal file
@@ -0,0 +1,89 @@
From a182b0471ba3c3329d93abfa07e3d452183a9137 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:23 -0500
Subject: x86/mm: get INVLPGB count max from CPUID

The CPU advertises the maximum number of pages that can be shot down
with one INVLPGB instruction in the CPUID data.

Save that information for later use.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/Kconfig.cpu               | 5 +++++
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/include/asm/tlbflush.h    | 7 +++++++
 arch/x86/kernel/cpu/amd.c          | 8 ++++++++
 4 files changed, 21 insertions(+)

--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES
 	def_bool y
 	depends on IA32_FEAT_CTL
 
+config X86_BROADCAST_TLB_FLUSH
+	def_bool y
+	depends on CPU_SUP_AMD && 64BIT
+
 menuconfig PROCESSOR_SELECT
 	bool "Supported processor vendors" if EXPERT
 	help
@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32
 config CPU_SUP_AMD
 	default y
 	bool "Support AMD processors" if PROCESSOR_SELECT
+	select X86_BROADCAST_TLB_FLUSH
 	help
 	  This enables detection, tunings and quirks for AMD processors
 
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -335,6 +335,7 @@
 #define X86_FEATURE_CLZERO      (13*32+ 0) /* "clzero" CLZERO instruction */
 #define X86_FEATURE_IRPERF      (13*32+ 1) /* "irperf" Instructions Retired Count */
 #define X86_FEATURE_XSAVEERPTR  (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB     (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */
 #define X86_FEATURE_RDPRU       (13*32+ 4) /* "rdpru" Read processor register at user level */
 #define X86_FEATURE_WBNOINVD    (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
 #define X86_FEATURE_AMD_IBPB    (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void)
 extern unsigned long mmu_cr4_features;
 extern u32 *trampoline_cr4_features;
 
+/* How many pages can we invalidate with one INVLPGB. */
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+extern u16 invlpgb_count_max;
+#else
+#define invlpgb_count_max 1
+#endif
+
 extern void initialize_tlbstate_and_flush(void);
 
 /*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
 
 #include "cpu.h"
 
+u16 invlpgb_count_max __ro_after_init;
+
 static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
 {
 	u32 gprs[8] = { 0 };
@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp
 	tlb_lli_2m[ENTRIES] = eax & mask;
 
 	tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+	/* Max number of pages INVLPGB can invalidate in one shot */
+	if (boot_cpu_has(X86_FEATURE_INVLPGB)) {
+		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+		invlpgb_count_max = (edx & 0xffff) + 1;
+	}
 }
 
 static const struct cpu_dev amd_cpu_dev = {
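The same CPUID data can be read from userspace to sanity-check what the kernel will compute. A minimal sketch, assuming a GCC/Clang toolchain on x86 (kernel feature word 13 corresponds to CPUID leaf 0x80000008 EBX, so INVLPGB support is EBX bit 3; the count sits in EDX bits 15:0, exactly as cpu_detect_tlb_amd() reads it above):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Leaf 0x80000008: extended address sizes and AMD feature bits. */
    if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
        return 1;

    /* EBX bit 3 advertises INVLPGB/TLBSYNC (X86_FEATURE_INVLPGB). */
    if (!(ebx & (1u << 3))) {
        puts("INVLPGB not supported");
        return 0;
    }

    /* EDX[15:0] is a zero-based count; the kernel stores it + 1. */
    printf("invlpgb_count_max = %u\n", (edx & 0xffff) + 1);
    return 0;
}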
129 debian/patches/patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch vendored Normal file
@@ -0,0 +1,129 @@
From cc3f8dd3033c79abd9f37a94efed74a535a703c9 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:24 -0500
Subject: x86/mm: add INVLPGB support code

Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/invlpgb.h  | 101 ++++++++++++++++++++++++++++++++
 arch/x86/include/asm/tlbflush.h |   1 +
 2 files changed, 102 insertions(+)
 create mode 100644 arch/x86/include/asm/invlpgb.h

--- /dev/null
+++ b/arch/x86/include/asm/invlpgb.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVLPGB
+#define _ASM_X86_INVLPGB
+
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+			     unsigned long addr, u16 extra_count,
+			     bool pmd_stride, u8 flags)
+{
+	u32 edx = (pcid << 16) | asid;
+	u32 ecx = (pmd_stride << 31) | extra_count;
+	u64 rax = addr | flags;
+
+	/* The low bits in rax are for flags. Verify addr is clean. */
+	VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+	/* INVLPGB; supported in binutils >= 2.36. */
+	asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx));
+}
+
+/* Wait for INVLPGB originated by this CPU to complete. */
+static inline void tlbsync(void)
+{
+	cant_migrate();
+	/* TLBSYNC; supported in binutils >= 2.36. */
+	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first can be used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_VA		BIT(0)
+#define INVLPGB_PCID		BIT(1)
+#define INVLPGB_ASID		BIT(2)
+#define INVLPGB_INCLUDE_GLOBAL	BIT(3)
+#define INVLPGB_FINAL_ONLY	BIT(4)
+#define INVLPGB_INCLUDE_NESTED	BIT(5)
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invlpgb_flush_user(unsigned long pcid,
+				      unsigned long addr)
+{
+	__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
+	tlbsync();
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+						unsigned long addr,
+						u16 nr,
+						bool pmd_stride)
+{
+	__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+	__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+	__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
+	tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+	__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+	__invlpgb(0, 0, 0, 0, 0, 0);
+	tlbsync();
+}
+
+#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 #include <asm/smp.h>
+#include <asm/invlpgb.h>
 #include <asm/invpcid.h>
 #include <asm/pti.h>
 #include <asm/processor-flags.h>
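Because __invlpgb() is emitted as raw opcode bytes, the operand packing is worth seeing in isolation. A small host-side C sketch that prints the register images instead of executing the privileged instruction (the helper name and sample values are illustrative; the packing copies the header above):

#include <stdint.h>
#include <stdio.h>

/* Pack INVLPGB operands the way __invlpgb() above does, and print them
 * rather than issuing the (privileged) instruction. */
static void show_invlpgb_operands(uint64_t asid, uint64_t pcid,
                                  uint64_t addr, uint16_t extra_count,
                                  int pmd_stride, uint8_t flags)
{
    uint32_t edx = (uint32_t)((pcid << 16) | asid); /* rdx: PCID and ASID */
    uint32_t ecx = ((uint32_t)pmd_stride << 31) | extra_count; /* rcx */
    uint64_t rax = addr | flags; /* rax: page address plus flag bits */

    printf("rax=%#018llx ecx=%#010x edx=%#010x\n",
           (unsigned long long)rax, (unsigned)ecx, (unsigned)edx);
}

int main(void)
{
    /* Flush 8 pages at 0x7f0000000000 for PCID 42:
     * flags = INVLPGB_PCID | INVLPGB_VA = 0x3, extra_count = nr - 1. */
    show_invlpgb_operands(0, 42, 0x7f0000000000ULL, 8 - 1, 0, 0x3);
    return 0;
}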
58 debian/patches/patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch vendored Normal file
@@ -0,0 +1,58 @@
From 6b6686f0d7e228d0a2d8c166204adea5484c20d7 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:25 -0500
Subject: x86/mm: use INVLPGB for kernel TLB flushes

Use broadcast TLB invalidation for kernel addresses when available.

Remove the need to send IPIs for kernel TLB flushes.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1048,6 +1048,30 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
+static bool broadcast_kernel_range_flush(struct flush_tlb_info *info)
+{
+	unsigned long addr;
+	unsigned long nr;
+
+	if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+		return false;
+
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return false;
+
+	if (info->end == TLB_FLUSH_ALL) {
+		invlpgb_flush_all();
+		return true;
+	}
+
+	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+		nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max);
+		invlpgb_flush_addr_nosync(addr, nr);
+	}
+	tlbsync();
+	return true;
+}
+
 static void do_kernel_range_flush(void *info)
 {
 	struct flush_tlb_info *f = info;
@@ -1067,7 +1091,9 @@ void flush_tlb_kernel_range(unsigned lon
 	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
 				  TLB_GENERATION_INVALID);
 
-	if (info->end == TLB_FLUSH_ALL)
+	if (broadcast_kernel_range_flush(info))
+		; /* Fall through. */
+	else if (info->end == TLB_FLUSH_ALL)
 		on_each_cpu(do_flush_tlb_all, NULL, 1);
 	else
 		on_each_cpu(do_kernel_range_flush, info, 1);
44 debian/patches/patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch vendored Normal file
@@ -0,0 +1,44 @@
From 6cffce503223f9076a5e16177905ba3ab6d9f7d8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:26 -0500
Subject: x86/mm: use INVLPGB in flush_tlb_all

The flush_tlb_all() function is not used a whole lot, but we might
as well use broadcast TLB flushing there, too.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/mm/tlb.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1036,6 +1036,19 @@ void flush_tlb_mm_range(struct mm_struct
 }
 
 
+static bool broadcast_flush_tlb_all(void)
+{
+	if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+		return false;
+
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return false;
+
+	guard(preempt)();
+	invlpgb_flush_all();
+	return true;
+}
+
 static void do_flush_tlb_all(void *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1044,6 +1057,8 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
+	if (broadcast_flush_tlb_all())
+		return;
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
@@ -0,0 +1,29 @@
From 3d23d79d14cdd3c68dc5bffbaf34a60eaca7fa40 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:27 -0500
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing

In the page reclaim code, we only track the CPU(s) where the TLB needs
to be flushed, rather than all the individual mappings that may be getting
invalidated.

Use broadcast TLB flushing when that is available.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/mm/tlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1292,7 +1292,9 @@ void arch_tlbbatch_flush(struct arch_tlb
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		invlpgb_flush_all_nonglobals();
+	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
 		flush_tlb_multi(&batch->cpumask, info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
@@ -0,0 +1,602 @@
From 79c9df0c7637c8ba8a1833889a2ace355d56c96e Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:28 -0500
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
 processes

Use broadcast TLB invalidation, using the INVPLGB instruction, on AMD EPYC 3
and newer CPUs.

In order to not exhaust PCID space, and keep TLB flushes local for single
threaded processes, we only hand out broadcast ASIDs to processes active on
3 or more CPUs, and gradually increase the threshold as broadcast ASID space
is depleted.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/mmu.h         |   6 +
 arch/x86/include/asm/mmu_context.h |  14 ++
 arch/x86/include/asm/tlbflush.h    |  73 ++++++
 arch/x86/mm/tlb.c                  | 344 ++++++++++++++++++++++++++++-
 4 files changed, 425 insertions(+), 12 deletions(-)

--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -67,6 +67,12 @@ typedef struct {
 	u16 pkey_allocation_map;
 	s16 execute_only_pkey;
 #endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+	u16 global_asid;
+	bool asid_transition;
+#endif
+
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(mm)	\
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
 #define enter_lazy_tlb enter_lazy_tlb
 extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
+extern void destroy_context_free_global_asid(struct mm_struct *mm);
+
 /*
  * Init a new mm. Used on mm copies, like at fork()
  * and on mm's that are brand-new, like at execve().
@@ -160,6 +162,14 @@ static inline int init_new_context(struc
 		mm->context.execute_only_pkey = -1;
 	}
 #endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		mm->context.global_asid = 0;
+		mm->context.asid_transition = false;
+	}
+#endif
+
 	mm_reset_untag_mask(mm);
 	init_new_context_ldt(mm);
 	return 0;
@@ -169,6 +179,10 @@ static inline int init_new_context(struc
 static inline void destroy_context(struct mm_struct *mm)
 {
 	destroy_context_ldt(mm);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		destroy_context_free_global_asid(mm);
+#endif
 }
 
 extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
 
+#include <asm/barrier.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
@@ -238,6 +239,78 @@ void flush_tlb_one_kernel(unsigned long
 void flush_tlb_multi(const struct cpumask *cpumask,
 		      const struct flush_tlb_info *info);
 
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+static inline bool is_dyn_asid(u16 asid)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return true;
+
+	return asid < TLB_NR_DYN_ASIDS;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+	return !is_dyn_asid(asid);
+}
+
+static inline bool in_asid_transition(const struct flush_tlb_info *info)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return false;
+
+	return info->mm && READ_ONCE(info->mm->context.asid_transition);
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+	u16 asid;
+
+	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+		return 0;
+
+	asid = smp_load_acquire(&mm->context.global_asid);
+
+	/* mm->context.global_asid is either 0, or a global ASID */
+	VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+	return asid;
+}
+#else
+static inline bool is_dyn_asid(u16 asid)
+{
+	return true;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+	return false;
+}
+
+static inline bool in_asid_transition(const struct flush_tlb_info *info)
+{
+	return false;
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+	return false;
+}
+
+static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+	VM_WARN_ON_ONCE(1);
+}
+
+static inline void consider_global_asid(struct mm_struct *mm)
+{
+}
+#endif
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
  * use different names for each of them:
  *
  * ASID  - [0, TLB_NR_DYN_ASIDS-1]
- *         the canonical identifier for an mm
+ *         the canonical identifier for an mm, dynamically allocated on each CPU
+ *         [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ *         the canonical, global identifier for an mm, identical across all CPUs
  *
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
  *         the value we write into the PCID part of CR3; corresponds to the
  *         ASID+1, because PCID 0 is special.
  *
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
  *         for KPTI each mm has two address spaces and thus needs two
  *         PCID values, but we can still do with a single ASID denomination
  *         for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st
 		return;
 	}
 
+	/*
+	 * TLB consistency for global ASIDs is maintained with broadcast TLB
+	 * flushing. The TLB is never outdated, and does not need flushing.
+	 */
+	if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) {
+		u16 global_asid = mm_global_asid(next);
+
+		if (global_asid) {
+			*new_asid = global_asid;
+			*need_flush = false;
+			return;
+		}
+	}
+
 	if (this_cpu_read(cpu_tlbstate.invalidate_other))
 		clear_asid_other();
 
@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st
 	*need_flush = true;
 }
 
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+/*
+ * Logic for broadcast TLB invalidation.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 };
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 };
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+static void reset_global_asid_space(void)
+{
+	lockdep_assert_held(&global_asid_lock);
+
+	/*
+	 * A global TLB flush guarantees that any stale entries from
+	 * previously freed global ASIDs get flushed from the TLB
+	 * everywhere, making these global ASIDs safe to reuse.
+	 */
+	invlpgb_flush_all_nonglobals();
+
+	/*
+	 * Clear all the previously freed global ASIDs from the
+	 * broadcast_asid_used bitmap, now that the global TLB flush
+	 * has made them actually available for re-use.
+	 */
+	bitmap_andnot(global_asid_used, global_asid_used,
+			global_asid_freed, MAX_ASID_AVAILABLE);
+	bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+	/*
+	 * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID
+	 * assignments, for tasks doing IPI based TLB shootdowns.
+	 * Restart the search from the start of the global ASID space.
+	 */
+	last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 get_global_asid(void)
+{
+	u16 asid;
+
+	lockdep_assert_held(&global_asid_lock);
+
+	/* The previous allocated ASID is at the top of the address space. */
+	if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+		reset_global_asid_space();
+
+	asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+	if (asid >= MAX_ASID_AVAILABLE) {
+		/* This should never happen. */
+		VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available);
+		return 0;
+	}
+
+	/* Claim this global ASID. */
+	__set_bit(asid, global_asid_used);
+	last_global_asid = asid;
+	global_asid_available--;
+	return asid;
+}
+
+/*
+ * Returns true if the mm is transitioning from a CPU-local ASID to a global
+ * (INVLPGB) ASID, or the other way around.
+ */
+static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+	u16 global_asid = mm_global_asid(next);
+
+	if (global_asid && prev_asid != global_asid)
+		return true;
+
+	if (!global_asid && is_global_asid(prev_asid))
+		return true;
+
+	return false;
+}
+
+void destroy_context_free_global_asid(struct mm_struct *mm)
+{
+	if (!mm->context.global_asid)
+		return;
+
+	guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+	/* The global ASID can be re-used only after flush at wrap-around. */
+	__set_bit(mm->context.global_asid, global_asid_freed);
+
+	mm->context.global_asid = 0;
+	global_asid_available++;
+}
+
+/*
+ * Check whether a process is currently active on more than "threshold" CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+	int count = 0;
+	int cpu;
+
+	/* This quick check should eliminate most single threaded programs. */
+	if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+		return false;
+
+	/* Slower check to make sure. */
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		/* Skip the CPUs that aren't really running this process. */
+		if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+			continue;
+
+		if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+			continue;
+
+		if (++count > threshold)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+	u16 asid;
+
+	guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+	/* This process is already using broadcast TLB invalidation. */
+	if (mm->context.global_asid)
+		return;
+
+	/* The last global ASID was consumed while waiting for the lock. */
+	if (!global_asid_available) {
+		VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+		return;
+	}
+
+	asid = get_global_asid();
+	if (!asid)
+		return;
+
+	/*
+	 * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+	 * finish_asid_transition() needs to observe asid_transition = true
+	 * once it observes global_asid.
+	 */
+	mm->context.asid_transition = true;
+	smp_store_release(&mm->context.global_asid, asid);
+}
+
+static bool meets_global_asid_threshold(struct mm_struct *mm)
+{
+	if (!global_asid_available)
+		return false;
+
+	/*
+	 * Assign a global ASID if the process is active on
+	 * 4 or more CPUs simultaneously.
+	 */
+	return mm_active_cpus_exceeds(mm, 3);
+}
+
+static void consider_global_asid(struct mm_struct *mm)
+{
+	if (!static_cpu_has(X86_FEATURE_INVLPGB))
+		return;
+
+	/* Check every once in a while. */
+	if ((current->pid & 0x1f) != (jiffies & 0x1f))
+		return;
+
+	if (meets_global_asid_threshold(mm))
+		use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+	struct mm_struct *mm = info->mm;
+	int bc_asid = mm_global_asid(mm);
+	int cpu;
+
+	if (!READ_ONCE(mm->context.asid_transition))
+		return;
+
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		/*
+		 * The remote CPU is context switching. Wait for that to
+		 * finish, to catch the unlikely case of it switching to
+		 * the target mm with an out of date ASID.
+		 */
+		while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+			cpu_relax();
+
+		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+			continue;
+
+		/*
+		 * If at least one CPU is not using the global ASID yet,
+		 * send a TLB flush IPI. The IPI should cause stragglers
+		 * to transition soon.
+		 *
+		 * This can race with the CPU switching to another task;
+		 * that results in a (harmless) extra IPI.
+		 */
+		if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+			flush_tlb_multi(mm_cpumask(info->mm), info);
+			return;
+		}
+	}
+
+	/* All the CPUs running this process are using the global ASID. */
+	WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+	bool pmd = info->stride_shift == PMD_SHIFT;
+	unsigned long maxnr = invlpgb_count_max;
+	unsigned long asid = info->mm->context.global_asid;
+	unsigned long addr = info->start;
+	unsigned long nr;
+
+	/* Flushing multiple pages at once is not supported with 1GB pages. */
+	if (info->stride_shift > PMD_SHIFT)
+		maxnr = 1;
+
+	/*
+	 * TLB flushes with INVLPGB are kicked off asynchronously.
+	 * The inc_mm_tlb_gen() guarantees page table updates are done
+	 * before these TLB flushes happen.
+	 */
+	if (info->end == TLB_FLUSH_ALL) {
+		invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+	} else do {
+		/*
+		 * Calculate how many pages can be flushed at once; if the
+		 * remainder of the range is less than one page, flush one.
+		 */
+		nr = min(maxnr, (info->end - addr) >> info->stride_shift);
+		nr = max(nr, 1UL);
+
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+		addr += nr << info->stride_shift;
+	} while (addr < info->end);
+
+	finish_asid_transition(info);
+
+	/* Wait for the INVLPGBs kicked off above to finish. */
+	tlbsync();
+}
+#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */
+
 /*
  * Given an ASID, flush the corresponding user ASID. We can delay this
  * until the next time we switch to it.
@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct
 	 */
 	if (prev == next) {
 		/* Not actually switching mm's */
-		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
-			   next->context.ctx_id);
+		VM_WARN_ON(is_dyn_asid(prev_asid) &&
+			   this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+			   next->context.ctx_id);
 
 		/*
 		 * If this races with another thread that enables lam, 'new_lam'
@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
 		/*
+		 * Check if the current mm is transitioning to a new ASID.
+		 */
+		if (needs_global_asid_reload(next, prev_asid)) {
+			next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+			choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+			goto reload_tlb;
+		}
+
+		/*
+		 * Broadcast TLB invalidation keeps this PCID up to date
+		 * all the time.
+		 */
+		if (is_global_asid(prev_asid))
+			return;
+
+		/*
 		 * If the CPU is not in lazy TLB mode, we are just switching
 		 * from one thread in a process to another thread in the same
 		 * process. No TLB flush required.
@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct
 		cond_mitigation(tsk);
 
 		/*
+		 * Let nmi_uaccess_okay() and finish_asid_transition()
+		 * know that we're changing CR3.
+		 */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+
+		/*
 		 * Stop remote flushes for the previous mm.
 		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
 		 * but the bitmap manipulation can cause cache line contention.
@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
-		/* Let nmi_uaccess_okay() know that we're changing CR3. */
-		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
-		barrier();
 	}
 
+reload_tlb:
 	new_lam = mm_lam_cr3_mask(next);
 	if (need_flush) {
+		VM_WARN_ON_ONCE(is_global_asid(new_asid));
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 		load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info)
 	const struct flush_tlb_info *f = info;
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+	u64 local_tlb_gen;
 	bool local = smp_processor_id() == f->initiating_cpu;
 	unsigned long nr_invalidate = 0;
 	u64 mm_tlb_gen;
@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info)
 	if (unlikely(loaded_mm == &init_mm))
 		return;
 
+	/* Reload the ASID if transitioning into or out of a global ASID */
+	if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) {
+		switch_mm_irqs_off(NULL, loaded_mm, NULL);
+		loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	}
+
+	/* Broadcast ASIDs are always kept up to date with INVLPGB. */
+	if (is_global_asid(loaded_mm_asid))
+		return;
+
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info)
 		return;
 	}
 
+	local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
 	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
 		     f->new_tlb_gen <= local_tlb_gen)) {
 		/*
@@ -926,7 +1243,7 @@ STATIC_NOPV void native_flush_tlb_multi(
 	 * up on the new contents of what used to be page tables, while
 	 * doing a speculative memory access.
 	 */
-	if (info->freed_tables)
+	if (info->freed_tables || in_asid_transition(info))
 		on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
 	else
 		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
@@ -1021,8 +1338,11 @@ void flush_tlb_mm_range(struct mm_struct
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+	if (mm_global_asid(mm)) {
+		broadcast_tlb_flush(info);
+	} else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
 		flush_tlb_multi(mm_cpumask(mm), info);
+		consider_global_asid(mm);
 	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
 		lockdep_assert_irqs_enabled();
 		local_irq_disable();
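The global ASID lifecycle above (allocate from a used bitmap, mark freed on process exit, recycle freed slots only after a full flush at wrap-around) is self-contained enough to model in userspace. A minimal C sketch with shrunken constants and the broadcast flush replaced by a stub; names mirror the patch:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define TLB_NR_DYN_ASIDS   6
#define MAX_ASID_AVAILABLE 32 /* shrunk for the demo */

static bool global_asid_used[MAX_ASID_AVAILABLE];
static bool global_asid_freed[MAX_ASID_AVAILABLE];
static unsigned last_global_asid = MAX_ASID_AVAILABLE;

static void reset_global_asid_space(void)
{
    /* Stand-in for invlpgb_flush_all_nonglobals(): after this, no TLB
     * anywhere still holds entries tagged with a freed global ASID. */
    puts("flush: stale entries for freed ASIDs are gone everywhere");

    for (unsigned i = 0; i < MAX_ASID_AVAILABLE; i++)
        if (global_asid_freed[i])
            global_asid_used[i] = false;
    memset(global_asid_freed, 0, sizeof(global_asid_freed));
    last_global_asid = TLB_NR_DYN_ASIDS;
}

static unsigned get_global_asid(void)
{
    if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
        reset_global_asid_space();

    /* Mirrors find_next_zero_bit(global_asid_used, ..., last_global_asid). */
    for (unsigned asid = last_global_asid; asid < MAX_ASID_AVAILABLE; asid++) {
        if (!global_asid_used[asid]) {
            global_asid_used[asid] = true;
            last_global_asid = asid;
            return asid;
        }
    }
    return 0; /* should never happen */
}

int main(void)
{
    unsigned a = get_global_asid(); /* triggers the initial reset */
    unsigned b = get_global_asid();

    printf("allocated %u and %u\n", a, b);
    global_asid_freed[a] = true; /* reusable only after the next reset */
    return 0;
}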
@@ -0,0 +1,135 @@
From 647727eaa06fc61fbc55de4c09ab0c0fe7bc7263 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:29 -0500
Subject: x86/mm: do targeted broadcast flushing from tlbbatch code

Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.

This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/tlbbatch.h |  1 +
 arch/x86/include/asm/tlbflush.h | 12 ++-----
 arch/x86/mm/tlb.c               | 57 +++++++++++++++++++++++++++++++--
 3 files changed, 58 insertions(+), 12 deletions(-)

--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
 	 * the PFNs being flushed..
 	 */
 	struct cpumask cpumask;
+	bool used_invlpgb;
 };
 
 #endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -358,21 +358,15 @@ static inline u64 inc_mm_tlb_gen(struct
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
-{
-	inc_mm_tlb_gen(mm);
-	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	flush_tlb_mm(mm);
 }
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+				      struct mm_struct *mm,
+				      unsigned long uaddr);
 
 static inline bool pte_flags_need_flush(unsigned long oldflags,
 					unsigned long newflags,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1612,9 +1612,7 @@ void arch_tlbbatch_flush(struct arch_tlb
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
-		invlpgb_flush_all_nonglobals();
-	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
 		flush_tlb_multi(&batch->cpumask, info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
@@ -1623,12 +1621,65 @@ void arch_tlbbatch_flush(struct arch_tlb
 		local_irq_enable();
 	}
 
+	/*
+	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+	 * The cpumask above contains only CPUs that were running tasks
+	 * not using broadcast TLB flushing.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+		tlbsync();
+		migrate_enable();
+		batch->used_invlpgb = false;
+	}
+
 	cpumask_clear(&batch->cpumask);
 
 	put_flush_tlb_info();
 	put_cpu();
 }
 
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+			       struct mm_struct *mm,
+			       unsigned long uaddr)
+{
+	u16 asid = mm_global_asid(mm);
+
+	if (asid) {
+		/*
+		 * Queue up an asynchronous invalidation. The corresponding
+		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+		 * on the same CPU.
+		 */
+		if (!batch->used_invlpgb) {
+			batch->used_invlpgb = true;
+			migrate_disable();
+		}
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+		/*
+		 * Some CPUs might still be using a local ASID for this
+		 * process, and require IPIs, while others are using the
+		 * global ASID.
+		 *
+		 * In this corner case we need to do both the broadcast
+		 * TLB invalidation, and send IPIs. The IPIs will help
+		 * stragglers transition to the broadcast ASID.
+		 */
+		if (READ_ONCE(mm->context.asid_transition))
+			asid = 0;
+	}
+
+	if (!asid) {
+		inc_mm_tlb_gen(mm);
+		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	}
+
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
 /*
  * Blindly accessing user memory from NMI context can be dangerous
  * if we're in the middle of switching the current user task or
debian/patches/patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
vendored
Normal file
79
debian/patches/patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
vendored
Normal file
@ -0,0 +1,79 @@
From 0678da9f0870f0d211d49808a66e98abc0c58438 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:30 -0500
Subject: x86/mm: enable AMD translation cache extensions

With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.

This can help reduce the TLB miss rate, by keeping more intermediate
mappings in the cache.

From the AMD manual:

Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
TLB entries. When this bit is 0, these instructions remove the target PTE
from the TLB as well as all upper-level table entries that are cached
in the TLB, whether or not they are associated with the target PTE.
When this bit is set, these instructions will remove the target PTE and
only those upper-level entries that lead to the target PTE in
the page table hierarchy, leaving unrelated upper-level entries intact.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/msr-index.h | 2 ++
arch/x86/kernel/cpu/amd.c | 4 ++++
tools/arch/x86/include/asm/msr-index.h | 2 ++
3 files changed, 8 insertions(+)

--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */

#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)

/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86

/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_feature_enabled(X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}

#ifdef CONFIG_X86_32
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */

#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)

/*
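
For reference, the EFER bit macros added above reduce to plain bit arithmetic; a small self-contained C check (the EFER value shown is an arbitrary example, and real MSR access needs ring 0 or /dev/cpu/*/msr):

#include <stdint.h>
#include <stdio.h>

#define _EFER_TCE 15
#define EFER_TCE (1 << _EFER_TCE)

int main(void)
{
	uint64_t efer = 0x0000000000000d01ULL;	/* example EFER contents */

	/* msr_set_bit(MSR_EFER, _EFER_TCE) boils down to this OR */
	efer |= EFER_TCE;
	printf("EFER.TCE set: %s (EFER=%#llx)\n",
	       (efer & EFER_TCE) ? "yes" : "no",
	       (unsigned long long)efer);
	return 0;
}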
66
debian/patches/patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
vendored
Normal file
@ -0,0 +1,66 @@
From 02d1759eda082f9595f3232f5dffd5d49943924a Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:31 -0500
Subject: x86/mm: only invalidate final translations with INVLPGB

Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.

On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/invlpgb.h | 10 ++++++++--
arch/x86/mm/tlb.c | 8 ++++----
2 files changed, 12 insertions(+), 6 deletions(-)

--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -67,9 +67,15 @@ static inline void invlpgb_flush_user(un
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr,
- bool pmd_stride)
+ bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+ unsigned long flags = INVLPGB_PCID | INVLPGB_VA;
+
+ if (!freed_tables)
+ flags |= INVLPGB_FINAL_ONLY;
+
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags);
}

/* Flush all mappings for a given PCID, not including globals. */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -518,10 +518,10 @@ static void broadcast_tlb_flush(struct f
nr = min(maxnr, (info->end - addr) >> info->stride_shift);
nr = max(nr, 1);

- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables);

addr += nr << info->stride_shift;
} while (addr < info->end);
@@ -1654,10 +1654,10 @@ void arch_tlbbatch_add_pending(struct ar
batch->used_invlpgb = true;
migrate_disable();
}
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false);

/*
* Some CPUs might still be using a local ASID for this
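
The only decision the new freed_tables parameter adds is whether INVLPGB_FINAL_ONLY can be set. A minimal stand-alone sketch of that flag computation (the flag values here are illustrative, not the hardware encoding):

#include <stdbool.h>
#include <stdio.h>

#define INVLPGB_PCID       (1u << 0)	/* illustrative bit positions */
#define INVLPGB_VA         (1u << 1)
#define INVLPGB_FINAL_ONLY (1u << 2)

static unsigned int invlpgb_flags(bool freed_tables)
{
	unsigned int flags = INVLPGB_PCID | INVLPGB_VA;

	/* Keep intermediate (non-leaf) translations cached unless
	 * page tables were actually freed. */
	if (!freed_tables)
		flags |= INVLPGB_FINAL_ONLY;
	return flags;
}

int main(void)
{
	printf("PTEs unmapped only: %#x\n", invlpgb_flags(false));
	printf("page tables freed:  %#x\n", invlpgb_flags(true));
	return 0;
}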
94
debian/patches/patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
vendored
Normal file
@ -0,0 +1,94 @@
From b61dfc43cfc7511795366dfd9260f0959ca2f2d2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 19 Dec 2024 15:32:53 -0500
Subject: mm: remove unnecessary calls to lru_add_drain

There seem to be several categories of calls to lru_add_drain
and lru_add_drain_all.

The first are code paths that recently allocated, swapped in,
or otherwise processed a batch of pages, and want them all on
the LRU. These drain pages that were recently allocated,
probably on the local CPU.

A second category is code paths that are actively trying to
reclaim, migrate, or offline memory. These often use lru_add_drain_all
to drain the caches on all CPUs.

However, there also seem to be some other callers where we
aren't really doing either. They are calling lru_add_drain(),
despite operating on pages that may have been allocated
long ago, and quite possibly on different CPUs.

Those calls are not likely to be effective at anything but
creating lock contention on the LRU locks.

Remove the lru_add_drain calls in the latter category.

Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
---
mm/memory.c | 1 -
mm/mmap.c | 2 --
mm/swap_state.c | 1 -
mm/vma.c | 2 --
4 files changed, 6 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
struct mmu_notifier_range range;
struct mmu_gather tlb;

- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}

- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
vma, new_start, length, false, true))
return -ENOMEM;

- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];

- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;

- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
429
debian/patches/patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch
vendored
Normal file
@ -0,0 +1,429 @@
From e2d1ffb13e3909dab142f0f8ec8f934b79930717 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Mon, 14 Oct 2024 16:13:39 +0100
Subject: vdso: Introduce vdso/page.h

The VDSO implementation includes headers from outside of the
vdso/ namespace.

Introduce vdso/page.h to make sure that the generic library
uses only the allowed namespace.

Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com
---
arch/alpha/include/asm/page.h | 6 +-----
arch/arc/include/uapi/asm/page.h | 7 +++----
arch/arm/include/asm/page.h | 5 +----
arch/arm64/include/asm/page-def.h | 5 +----
arch/csky/include/asm/page.h | 8 ++------
arch/hexagon/include/asm/page.h | 4 +---
arch/loongarch/include/asm/page.h | 7 +------
arch/m68k/include/asm/page.h | 6 ++----
arch/microblaze/include/asm/page.h | 5 +----
arch/mips/include/asm/page.h | 7 +------
arch/nios2/include/asm/page.h | 7 +------
arch/openrisc/include/asm/page.h | 11 +----------
arch/parisc/include/asm/page.h | 4 +---
arch/powerpc/include/asm/page.h | 10 +---------
arch/riscv/include/asm/page.h | 4 +---
arch/s390/include/asm/page.h | 13 +++++--------
arch/sh/include/asm/page.h | 6 ++----
arch/sparc/include/asm/page_32.h | 4 +---
arch/sparc/include/asm/page_64.h | 4 +---
arch/um/include/asm/page.h | 5 +----
arch/x86/include/asm/page_types.h | 5 +----
arch/xtensa/include/asm/page.h | 8 +-------
include/vdso/page.h | 30 ++++++++++++++++++++++++++++++
23 files changed, 61 insertions(+), 110 deletions(-)
create mode 100644 include/vdso/page.h

--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -4,11 +4,7 @@

#include <linux/const.h>
#include <asm/pal.h>
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#ifndef __ASSEMBLY__

--- a/arch/arc/include/uapi/asm/page.h
+++ b/arch/arc/include/uapi/asm/page.h
@@ -14,7 +14,7 @@

/* PAGE_SHIFT determines the page size */
#ifdef __KERNEL__
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+#include <vdso/page.h>
#else
/*
* Default 8k
@@ -24,11 +24,10 @@
* not available
*/
#define PAGE_SHIFT 13
+#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
+#define PAGE_MASK (~(PAGE_SIZE-1))
#endif

-#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
#define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwrds */

-#define PAGE_MASK (~(PAGE_SIZE-1))
-
#endif /* _UAPI__ASM_ARC_PAGE_H */
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -7,10 +7,7 @@
#ifndef _ASMARM_PAGE_H
#define _ASMARM_PAGE_H

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>

#ifndef __ASSEMBLY__

--- a/arch/arm64/include/asm/page-def.h
+++ b/arch/arm64/include/asm/page-def.h
@@ -10,9 +10,6 @@

#include <linux/const.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#endif /* __ASM_PAGE_DEF_H */
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -7,12 +7,8 @@
#include <asm/cache.h>
#include <linux/const.h>

-/*
- * PAGE_SHIFT determines the page size: 4KB
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
+
#define THREAD_SIZE (PAGE_SIZE * 2)
#define THREAD_MASK (~(THREAD_SIZE - 1))
#define THREAD_SHIFT (PAGE_SHIFT + 1)
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -45,9 +45,7 @@
#define HVM_HUGEPAGE_SIZE 0x5
#endif

-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>

#ifdef __KERNEL__
#ifndef __ASSEMBLY__
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -8,12 +8,7 @@
#include <linux/const.h>
#include <asm/addrspace.h>

-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>

#define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3)
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -6,10 +6,8 @@
#include <asm/setup.h>
#include <asm/page_offset.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PAGE_OFFSET (PAGE_OFFSET_RAW)

#ifndef __ASSEMBLY__
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -19,10 +19,7 @@

#ifdef __KERNEL__

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR))

--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -14,12 +14,7 @@
#include <linux/kernel.h>
#include <asm/mipsregs.h>

-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>

/*
* This is used for calculating the real page sizes
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -18,12 +18,7 @@
#include <linux/pfn.h>
#include <linux/const.h>

-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>

/*
* PAGE_OFFSET -- the first address of the first page of memory.
--- a/arch/openrisc/include/asm/page.h
+++ b/arch/openrisc/include/asm/page.h
@@ -15,16 +15,7 @@
#ifndef __ASM_OPENRISC_PAGE_H
#define __ASM_OPENRISC_PAGE_H

-
-/* PAGE_SHIFT determines the page size */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#define PAGE_OFFSET 0xc0000000
#define KERNELBASE PAGE_OFFSET
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -4,9 +4,7 @@

#include <linux/const.h>

-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -21,8 +21,7 @@
* page size. When using 64K pages however, whether we are really supporting
* 64K pages in HW or not is irrelevant to those definitions.
*/
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
+#include <vdso/page.h>

#ifndef __ASSEMBLY__
#ifndef CONFIG_HUGETLB_PAGE
@@ -42,13 +41,6 @@ extern unsigned int hpage_shift;
#endif

/*
- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we
- * assign PAGE_MASK to a larger type it gets extended the way we want
- * (i.e. with 1s in the high bits)
- */
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
-
-/*
* KERNELBASE is the virtual address of the start of the kernel, it's often
* the same as PAGE_OFFSET, but _might not be_.
*
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -12,9 +12,7 @@
#include <linux/pfn.h>
#include <linux/const.h>

-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>

#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -11,14 +11,11 @@
#include <linux/const.h>
#include <asm/types.h>

-#define _PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT)
-#define _PAGE_MASK (~(_PAGE_SIZE - 1))
+#include <vdso/page.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT _PAGE_SHIFT
-#define PAGE_SIZE _PAGE_SIZE
-#define PAGE_MASK _PAGE_MASK
+#define _PAGE_SHIFT PAGE_SHIFT
+#define _PAGE_SIZE PAGE_SIZE
+#define _PAGE_MASK PAGE_MASK
#define PAGE_DEFAULT_ACC _AC(0, UL)
/* storage-protection override */
#define PAGE_SPO_ACC 9
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -8,10 +8,8 @@

#include <linux/const.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PTE_MASK PAGE_MASK

#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -11,9 +11,7 @@

#include <linux/const.h>

-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#ifndef __ASSEMBLY__

--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -4,9 +4,7 @@

#include <linux/const.h>

-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

/* Flushing for D-cache alias handling is only needed if
* the page size is smaller than 16K.
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -9,10 +9,7 @@

#include <linux/const.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#ifndef __ASSEMBLY__

--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -6,10 +6,7 @@
#include <linux/types.h>
#include <linux/mem_encrypt.h>

-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)

--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -18,13 +18,7 @@
#include <asm/cache.h>
#include <asm/kmem_layout.h>

-/*
- * PAGE_SHIFT determines the page size
- */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>

#ifdef CONFIG_MMU
#define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR
--- /dev/null
+++ b/include/vdso/page.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_PAGE_H
+#define __VDSO_PAGE_H
+
+#include <uapi/linux/const.h>
+
+/*
+ * PAGE_SHIFT determines the page size.
+ *
+ * Note: This definition is required because PAGE_SHIFT is used
+ * in several places throughout the codebase.
+ */
+#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+
+#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
+
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+/*
+ * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ *
+ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
+ * So if we assign PAGE_MASK to a larger type it gets extended the
+ * way we want (i.e. with 1s in the high bits)
+ */
+#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
+#else
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#endif /* __VDSO_PAGE_H */
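
The sign-extension subtlety described in the comment is easy to verify in isolation. A minimal user-space demonstration, assuming 4 KiB pages (PAGE_SHIFT 12) and using a uint32_t to model PAGE_MASK on a 32-bit kernel:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages for the demo */

int main(void)
{
	/* ~((1 << 12) - 1) is a negative int: assigning it to a wider
	 * type sign-extends, giving 1s in the high bits. */
	int64_t wide_signed = ~((1 << PAGE_SHIFT) - 1);

	/* A 32-bit unsigned mask models PAGE_MASK on a 32-bit kernel:
	 * zero-extending it to 64 bits leaves the high bits clear. */
	uint32_t narrow_unsigned = ~((1U << PAGE_SHIFT) - 1);

	uint64_t phys = 0x123456789abcULL;	/* >4 GiB physical address */

	printf("signed PAGE_MASK:   %#llx\n",
	       (unsigned long long)(phys & (uint64_t)wide_signed));	/* 0x123456789000 */
	printf("unsigned PAGE_MASK: %#llx\n",
	       (unsigned long long)(phys & narrow_unsigned));		/* 0x56789000: high bits lost */
	return 0;
}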
68
debian/patches/patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch
vendored
Normal file
@ -0,0 +1,68 @@
From 4478ee194402472199e05d3e27a87f0fc775cc18 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 24 Oct 2024 13:34:26 +0000
Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures

With the introduction of an architecture-independent definition of
PAGE_MASK, we had to make a choice between defining it as 'unsigned long'
as on 64-bit architectures, or as signed 'long' as required for
architectures with a 64-bit phys_addr_t.

To reduce the risk for regressions and minimize the changes in behavior,
the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT
is set, but that ended up causing a regression after all in the
early_init_dt_add_memory_arch() function that uses 64-bit integers for
address calculation.

Presumably the same regression also affects mips32 and powerpc32 when
dealing with large amounts of memory on DT platforms: like arm32, they were
using the signed version unconditionally.

The two most sensible options for addressing the regression are either to
go back to an architecture-specific definition, using a signed constant on
arm/powerpc/mips and unsigned on the others, or to use the same definition
everywhere.

Use the simpler of those two and change them all to the signed version, in
the hope that this does not cause a different type of bug. Most of the
other 32-bit architectures have no large physical address support and are
rarely used, so it seems more likely that using the same definition helps
than hurts here.

In particular, x86-32 does have physical addressing extensions, so it
already changed to the signed version after the previous patch, so it makes
sense to use the same version on non-PAE as well.

Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org
Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/
---
include/vdso/page.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

--- a/include/vdso/page.h
+++ b/include/vdso/page.h
@@ -14,13 +14,14 @@

#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)

-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+#if !defined(CONFIG_64BIT)
/*
- * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ * Applies only to 32-bit architectures.
*
* Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
* So if we assign PAGE_MASK to a larger type it gets extended the
- * way we want (i.e. with 1s in the high bits)
+ * way we want (i.e. with 1s in the high bits) while masking a
+ * 64-bit value such as phys_addr_t.
*/
#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
#else
@ -102,7 +102,7 @@ Contains:

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
-@@ -6353,7 +6353,7 @@ retry:
+@@ -6384,7 +6384,7 @@ retry:
return 0;
}

@ -111,7 +111,7 @@ Contains:
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
-@@ -6382,6 +6382,10 @@ static bool allow_direct_reclaim(pg_data
+@@ -6413,6 +6413,10 @@ static bool allow_direct_reclaim(pg_data

wmark_ok = free_pages > pfmemalloc_reserve / 2;

@ -122,7 +122,7 @@ Contains:
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
-@@ -6447,7 +6451,7 @@ static bool throttle_direct_reclaim(gfp_
+@@ -6478,7 +6482,7 @@ static bool throttle_direct_reclaim(gfp_

/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
@ -131,7 +131,7 @@ Contains:
goto out;
break;
}
-@@ -6469,11 +6473,14 @@ static bool throttle_direct_reclaim(gfp_
+@@ -6500,11 +6504,14 @@ static bool throttle_direct_reclaim(gfp_
*/
if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
@ -148,7 +148,7 @@ Contains:

if (fatal_signal_pending(current))
return true;
-@@ -6976,14 +6983,14 @@ restart:
+@@ -7007,14 +7014,14 @@ restart:
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
194
debian/patches/patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
vendored
Normal file
@ -0,0 +1,194 @@
From eacae6d88bcc8a925124f97b7788bb2bfac8b267 Mon Sep 17 00:00:00 2001
From: Vinay Banakar <vny@google.com>
Date: Mon, 20 Jan 2025 16:47:29 -0600
Subject: mm: Optimize TLB flushes during page reclaim

The current implementation in shrink_folio_list() performs full TLB
flushes and issues IPIs for each individual page being reclaimed. This
causes unnecessary overhead during memory reclaim, whether triggered
by madvise(MADV_PAGEOUT) or kswapd, especially in scenarios where
applications are actively moving cold pages to swap while maintaining
high performance requirements for hot pages.

The current code:
1. Clears PTE and unmaps each page individually
2. Performs a full TLB flush on all cores using the VMA (via CR3 write) or
issues individual TLB shootdowns (invlpg+invpcid) for single-core usage
3. Submits each page individually to BIO

This approach results in:
- Excessive full TLB flushes across all cores
- Unnecessary IPI storms when processing multiple pages
- Suboptimal I/O submission patterns

I initially tried using selective TLB shootdowns (invlpg) instead of
full TLB flushes per each page to avoid interference with other
threads. However, this approach still required sending IPIs to all
cores for each page, which did not significantly improve application
throughput.

This patch instead optimizes the process by batching operations,
issuing one IPI per PMD instead of per page. This reduces interrupts
by a factor of 512 and enables batching page submissions to BIO. The
new approach:
1. Collect dirty pages that need to be written back
2. Issue a single TLB flush for all dirty pages in the batch
3. Process the collected pages for writebacks (submit to BIO)

Testing shows significant reduction in application throughput impact
during page-out operations. Applications maintain better performance
during memory reclaim, when triggered by explicit
madvise(MADV_PAGEOUT) calls.

I'd appreciate your feedback on this approach, especially on the
correctness of batched BIO submissions. Looking forward to your
comments.

Signed-off-by: Vinay Banakar <vny@google.com>
---
mm/vmscan.c | 120 ++++++++++++++++++++++++++++++++--------------------
1 file changed, 74 insertions(+), 46 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1053,6 +1053,7 @@ static unsigned int shrink_folio_list(st
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
+ LIST_HEAD(pageout_list);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1365,52 +1366,9 @@ retry:
if (!sc->may_writepage)
goto keep_locked;

- /*
- * Folio is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after I/O
- * starts and then write it out here.
- */
- try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug, folio_list)) {
- case PAGE_KEEP:
- goto keep_locked;
- case PAGE_ACTIVATE:
- /*
- * If shmem folio is split when writeback to swap,
- * the tail pages will make their own pass through
- * this function and be accounted then.
- */
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- goto activate_locked;
- case PAGE_SUCCESS:
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- stat->nr_pageout += nr_pages;
-
- if (folio_test_writeback(folio))
- goto keep;
- if (folio_test_dirty(folio))
- goto keep;
-
- /*
- * A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the folio.
- */
- if (!folio_trylock(folio))
- goto keep;
- if (folio_test_dirty(folio) ||
- folio_test_writeback(folio))
- goto keep_locked;
- mapping = folio_mapping(folio);
- fallthrough;
- case PAGE_CLEAN:
- ; /* try to free the folio below */
- }
+ /* Add to pageout list for deferred bio submissions */
+ list_add(&folio->lru, &pageout_list);
+ continue;
}

/*
@@ -1521,6 +1479,76 @@ keep:
}
/* 'folio_list' is always empty here */

+ if (!list_empty(&pageout_list)) {
+ /*
+ * Batch TLB flushes by flushing once before processing all dirty pages.
+ * Since we operate on one PMD at a time, this batches TLB flushes at
+ * PMD granularity rather than per-page, reducing IPIs.
+ */
+ struct address_space *mapping;
+ try_to_unmap_flush_dirty();
+
+ while (!list_empty(&pageout_list)) {
+ struct folio *folio = lru_to_folio(&pageout_list);
+ list_del(&folio->lru);
+
+ /* Recheck if page got reactivated */
+ if (folio_test_active(folio) ||
+ (folio_mapped(folio) && folio_test_young(folio)))
+ goto skip_pageout_locked;
+
+ mapping = folio_mapping(folio);
+ pageout_t pageout_res = pageout(folio, mapping, &plug);
+ switch (pageout_res) {
+ case PAGE_KEEP:
+ goto skip_pageout_locked;
+ case PAGE_ACTIVATE:
+ goto skip_pageout_locked;
+ case PAGE_SUCCESS:
+ stat->nr_pageout += folio_nr_pages(folio);
+
+ if (folio_test_writeback(folio) ||
+ folio_test_dirty(folio))
+ goto skip_pageout;
+
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the folio.
+ */
+ if (!folio_trylock(folio))
+ goto skip_pageout;
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
+ goto skip_pageout_locked;
+
+ // Try to free the page
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+
+ case PAGE_CLEAN:
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+ }
+
+skip_pageout_locked:
+ folio_unlock(folio);
+skip_pageout:
+ list_add(&folio->lru, &ret_folios);
+ }
+ }
+
/* Migrate folios selected for demotion */
stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_reclaimed += stat->nr_demoted;
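
The collect/flush/process structure the patch message describes can be summarized in a stand-alone sketch. All names here are invented stand-ins (flush_tlb_batched() for try_to_unmap_flush_dirty(), writeback() for pageout()); it shows only the batching shape, one flush per batch instead of one per page:

#include <stdio.h>

#define NPAGES 8

struct page { int dirty; int written; };

static int tlb_flushes;	/* counts the expensive global operations */

static void flush_tlb_batched(void) { tlb_flushes++; }

static void writeback(struct page *p) { p->written = 1; p->dirty = 0; }

int main(void)
{
	struct page pages[NPAGES] = { { .dirty = 1 }, { .dirty = 1 } };
	struct page *deferred[NPAGES];
	int n = 0;

	/* Pass 1: collect dirty pages instead of flushing per page. */
	for (int i = 0; i < NPAGES; i++)
		if (pages[i].dirty)
			deferred[n++] = &pages[i];

	/* One flush covers the whole batch (one IPI round per batch,
	 * not one per page). */
	if (n)
		flush_tlb_batched();

	/* Pass 2: now it is safe to start I/O on every collected page. */
	for (int i = 0; i < n; i++)
		writeback(deferred[i]);

	printf("%d pages written back with %d TLB flush(es)\n", n, tlb_flushes);
	return 0;
}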
19
debian/patches/series
vendored
@ -47,7 +47,6 @@ features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch
features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch

# Disable autoloading/probing of various drivers by default
-debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch
debian/snd-pcsp-disable-autoload.patch
bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch
debian/fjes-disable-autoload.patch
@ -203,6 +202,24 @@ patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch
patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch
patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch

+patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
+patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch
+patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch
+patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch
+patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch
+patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch
+patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch
+patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch
+patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch
+patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch
+patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
+patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
+patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
+patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch
+patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch
+
+patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
+
patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch
patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch
patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch