release 6.12.11

Konstantin Demin 2025-01-28 09:25:42 +03:00
parent 5b35bedab1
commit dbf57fda83
25 changed files with 2351 additions and 39 deletions

debian/changelog vendored

@@ -1,3 +1,10 @@
linux (6.12.11-1) sid; urgency=medium

  * New upstream stable update:
    https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.12.11

 -- Konstantin Demin <rockdrilla@gmail.com>  Tue, 28 Jan 2025 09:10:17 +0300

linux (6.12.10-1) sid; urgency=medium

  * New upstream stable update:


@@ -2491,8 +2491,6 @@ CONFIG_KEXEC_CORE=y
CONFIG_LZ4HC_COMPRESS=m
CONFIG_LZ4_COMPRESS=m
CONFIG_MFD_CORE=m
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_ND_BTT=m
CONFIG_ND_PFN=m
CONFIG_NETFS_SUPPORT=m


@@ -4064,8 +4064,6 @@ CONFIG_LZ4_COMPRESS=m
CONFIG_MAPPING_DIRTY_HELPERS=y
CONFIG_MCTP_FLOWS=y
CONFIG_MFD_CORE=m
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_MOUSE_PS2_SMBUS=y
CONFIG_ND_BTT=m
CONFIG_ND_PFN=m


@@ -3945,6 +3945,8 @@ CONFIG_MLX4_CORE=m
CONFIG_MMCONF_FAM10H=y
CONFIG_MMU=y
CONFIG_MMU_GATHER_MERGE_VMAS=y
CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
CONFIG_MMU_GATHER_TABLE_FREE=y
CONFIG_MMU_LAZY_TLB_REFCOUNT=y
CONFIG_MMU_NOTIFIER=y
CONFIG_MODULES_TREE_LOOKUP=y
@@ -4186,6 +4188,7 @@ CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m
CONFIG_X86=y
CONFIG_X86_64=y
CONFIG_X86_64_SMP=y
CONFIG_X86_BROADCAST_TLB_FLUSH=y
CONFIG_X86_CMOV=y
CONFIG_X86_CMPXCHG64=y
CONFIG_X86_DEBUGCTLMSR=y


@@ -1,27 +0,0 @@
From: Ben Hutchings <ben@decadent.org.uk>
Subject: cdc_ncm,cdc_mbim: Use NCM by default
Date: Sun, 31 Mar 2013 03:58:04 +0100
Forwarded: not-needed
Devices that support both NCM and MBIM modes should be kept in NCM
mode unless there is userland support for MBIM.
Set the default value of cdc_ncm.prefer_mbim to false and leave it to
userland (modem-manager) to override this with a modprobe.conf file
once it's ready to speak MBIM.
---
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -54,11 +54,7 @@
#include <linux/usb/cdc.h>
#include <linux/usb/cdc_ncm.h>
-#if IS_ENABLED(CONFIG_USB_NET_CDC_MBIM)
-static bool prefer_mbim = true;
-#else
static bool prefer_mbim;
-#endif
module_param(prefer_mbim, bool, 0644);
MODULE_PARM_DESC(prefer_mbim, "Prefer MBIM setting on dual NCM/MBIM functions");
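The module parameter left at false above can be flipped back by userspace once it is ready to speak MBIM; a minimal modprobe configuration sketch (the file name below is only an example, not something shipped by this package) would be:

	# /etc/modprobe.d/cdc_ncm.conf (example location)
	options cdc_ncm prefer_mbim=Y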


@@ -22,7 +22,7 @@ Export the currently un-exported symbols it depends on.
--- a/fs/file.c
+++ b/fs/file.c
@@ -792,6 +792,7 @@ struct file *file_close_fd(unsigned int
@@ -793,6 +793,7 @@ struct file *file_close_fd(unsigned int
return file;
}


@@ -47,7 +47,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com>
.reflect = menu_reflect,
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui
@@ -542,7 +542,7 @@ static int teo_enable_device(struct cpui
static struct cpuidle_governor teo_governor = {
.name = "teo",


@@ -0,0 +1,123 @@
From 6cb30d7518301094b9c7397a24a22cf538a1d64c Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:20 -0500
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional
Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using
paravirt, and not when running on bare metal.
There is no real good reason to do things differently for
each setup. Make them all the same.
Currently get_user_pages_fast synchronizes against page table
freeing in two different ways:
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
This is done because some paravirt TLB flush implementations
handle the TLB flush in the hypervisor, and will do the flush
even when the target CPU has interrupts disabled.
Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE.
Using RCU synchronization between page table freeing and get_user_pages_fast()
allows bare metal to also do TLB flushing while interrupts are disabled.
Various places in the mm do still block IRQs or disable preemption
as an implicit way to block RCU frees.
That makes it safe to use INVLPGB on AMD CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
---
arch/x86/Kconfig | 2 +-
arch/x86/kernel/paravirt.c | 7 +------
arch/x86/mm/pgtable.c | 16 ++++------------
3 files changed, 6 insertions(+), 19 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,7 +270,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = native_tlb_remove_table,
+ .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
-}
-#endif
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
{
pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, pte);
+ tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
tlb->need_flush_all = 1;
#endif
pagetable_pmd_dtor(ptdesc);
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+ tlb_remove_table(tlb, ptdesc_page(ptdesc));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
pagetable_pud_dtor(ptdesc);
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */


@@ -0,0 +1,84 @@
From df8f812b62c450b98b972ad0a4d5a5ba400f5eae Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:21 -0500
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
arch/x86/hyperv/mmu.c | 1 -
arch/x86/include/asm/paravirt.h | 5 -----
arch/x86/include/asm/paravirt_types.h | 2 --
arch/x86/kernel/kvm.c | 1 -
arch/x86/kernel/paravirt.c | 1 -
arch/x86/xen/mmu_pv.c | 1 -
6 files changed, 11 deletions(-)
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
-}
-
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
-
/* Hook for intercepting the destruction of an mm_struct. */
void (*exit_mmap)(struct mm_struct *mm);
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
- .mmu.tlb_remove_table = tlb_remove_table,
.mmu.exit_mmap = paravirt_nop,
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2137,7 +2137,6 @@ static const typeof(pv_ops) xen_mmu_ops
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,


@@ -0,0 +1,93 @@
From 8b2bd3f69b50cfe59eee4506413715878bcbb901 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:22 -0500
Subject: x86/mm: consolidate full flush threshold decision
Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
---
arch/x86/mm/tlb.c | 43 ++++++++++++++++++++-----------------------
1 file changed, 20 insertions(+), 23 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -981,6 +981,15 @@ static struct flush_tlb_info *get_flush_
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ info->start = 0;
+ info->end = TLB_FLUSH_ALL;
+ }
+
return info;
}
@@ -998,17 +1007,8 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1060,22 +1060,19 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
+ struct flush_tlb_info *info;
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+ guard(preempt)();
+
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+ if (info->end == TLB_FLUSH_ALL)
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
- put_flush_tlb_info();
- preempt_enable();
- }
+ put_flush_tlb_info();
}
/*
@@ -1247,7 +1244,7 @@ void arch_tlbbatch_flush(struct arch_tlb
int cpu = get_cpu();
- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
/*
* flush_tlb_multi() is not optimized for the common case in which only
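As a rough illustration of the consolidated threshold (assuming the default tlb_single_page_flush_ceiling of 33): a 64 KiB flush with a 4 KiB stride is (0x10000 >> 12) = 16 single-page invalidations and stays a ranged flush, while a 256 KiB flush is (0x40000 >> 12) = 64 > 33, so get_flush_tlb_info() now rewrites the range to start = 0, end = TLB_FLUSH_ALL and both the user and kernel paths fall through to a full flush.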


@@ -0,0 +1,89 @@
From a182b0471ba3c3329d93abfa07e3d452183a9137 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:23 -0500
Subject: x86/mm: get INVLPGB count max from CPUID
The CPU advertises the maximum number of pages that can be shot down
with one INVLPGB instruction in the CPUID data.
Save that information for later use.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/Kconfig.cpu | 5 +++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/tlbflush.h | 7 +++++++
arch/x86/kernel/cpu/amd.c | 8 ++++++++
4 files changed, 21 insertions(+)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES
def_bool y
depends on IA32_FEAT_CTL
+config X86_BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
menuconfig PROCESSOR_SELECT
bool "Supported processor vendors" if EXPERT
help
@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32
config CPU_SUP_AMD
default y
bool "Support AMD processors" if PROCESSOR_SELECT
+ select X86_BROADCAST_TLB_FLUSH
help
This enables detection, tunings and quirks for AMD processors
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -335,6 +335,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can we invalidate with one INVLPGB. */
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+extern u16 invlpgb_count_max;
+#else
+#define invlpgb_count_max 1
+#endif
+
extern void initialize_tlbstate_and_flush(void);
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (boot_cpu_has(X86_FEATURE_INVLPGB)) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ invlpgb_count_max = (edx & 0xffff) + 1;
+ }
}
static const struct cpu_dev amd_cpu_dev = {
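Worked example for the decode above (illustrative numbers only): if the low 16 bits of EDX returned by CPUID leaf 0x80000008 read 7, then 7 + 1 = 8, so invlpgb_count_max is set to 8 and a single INVLPGB may cover up to eight pages; a low word of 0 still yields the architectural minimum of one page per instruction.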


@@ -0,0 +1,129 @@
From cc3f8dd3033c79abd9f37a94efed74a535a703c9 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:24 -0500
Subject: x86/mm: add INVLPGB support code
Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/invlpgb.h | 101 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/tlbflush.h | 1 +
2 files changed, 102 insertions(+)
create mode 100644 arch/x86/include/asm/invlpgb.h
--- /dev/null
+++ b/arch/x86/include/asm/invlpgb.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVLPGB
+#define _ASM_X86_INVLPGB
+
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 extra_count,
+ bool pmd_stride, u8 flags)
+{
+ u32 edx = (pcid << 16) | asid;
+ u32 ecx = (pmd_stride << 31) | extra_count;
+ u64 rax = addr | flags;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx));
+}
+
+/* Wait for INVLPGB originated by this CPU to complete. */
+static inline void tlbsync(void)
+{
+ cant_migrate();
+ /* TLBSYNC: supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first can be used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_VA BIT(0)
+#define INVLPGB_PCID BIT(1)
+#define INVLPGB_ASID BIT(2)
+#define INVLPGB_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FINAL_ONLY BIT(4)
+#define INVLPGB_INCLUDE_NESTED BIT(5)
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invlpgb_flush_user(unsigned long pcid,
+ unsigned long addr)
+{
+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
+ tlbsync();
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
+{
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
+ tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, 0);
+ tlbsync();
+}
+
+#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invlpgb.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
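A minimal usage sketch of the helpers above, with hypothetical values (the later patches in this series drive them the same way): the *_nosync calls only queue broadcast invalidations, so a tlbsync() on the same CPU is needed before relying on them.

	/* hypothetical example: page-aligned user address, PCID 5 */
	unsigned long addr = 0x7f0000000000UL;

	invlpgb_flush_user_nr_nosync(5, addr, 16, false);	/* queue 16 4k-page invalidations */
	tlbsync();						/* wait for them on this CPU */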


@@ -0,0 +1,58 @@
From 6b6686f0d7e228d0a2d8c166204adea5484c20d7 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:25 -0500
Subject: x86/mm: use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1048,6 +1048,30 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+static bool broadcast_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr;
+ unsigned long nr;
+
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_all();
+ return true;
+ }
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max);
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ tlbsync();
+ return true;
+}
+
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
@@ -1067,7 +1091,9 @@ void flush_tlb_kernel_range(unsigned lon
info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
- if (info->end == TLB_FLUSH_ALL)
+ if (broadcast_kernel_range_flush(info))
+ ; /* Fall through. */
+ else if (info->end == TLB_FLUSH_ALL)
on_each_cpu(do_flush_tlb_all, NULL, 1);
else
on_each_cpu(do_kernel_range_flush, info, 1);
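Concrete example of the chunking loop above (invlpgb_count_max is CPU-dependent; assume 8 for illustration): flushing a 40-page kernel range issues ceil(40 / 8) = 5 INVLPGB instructions, each covering up to 8 pages, followed by a single TLBSYNC, instead of sending an IPI to every CPU.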


@@ -0,0 +1,44 @@
From 6cffce503223f9076a5e16177905ba3ab6d9f7d8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:26 -0500
Subject: x86/mm: use INVLPGB in flush_tlb_all
The flush_tlb_all() function is not used a whole lot, but we might
as well use broadcast TLB flushing there, too.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1036,6 +1036,19 @@ void flush_tlb_mm_range(struct mm_struct
}
+static bool broadcast_flush_tlb_all(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ guard(preempt)();
+ invlpgb_flush_all();
+ return true;
+}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1044,6 +1057,8 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
+ if (broadcast_flush_tlb_all())
+ return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}


@@ -0,0 +1,29 @@
From 3d23d79d14cdd3c68dc5bffbaf34a60eaca7fa40 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:27 -0500
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing
In the page reclaim code, we only track the CPU(s) where the TLB needs
to be flushed, rather than all the individual mappings that may be getting
invalidated.
Use broadcast TLB flushing when that is available.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/mm/tlb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1292,7 +1292,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ invlpgb_flush_all_nonglobals();
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();


@@ -0,0 +1,602 @@
From 79c9df0c7637c8ba8a1833889a2ace355d56c96e Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:28 -0500
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
processes
Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
and newer CPUs.
In order to not exhaust PCID space, and keep TLB flushes local for single
threaded processes, we only hand out broadcast ASIDs to processes active on
3 or more CPUs, and gradually increase the threshold as broadcast ASID space
is depleted.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/mmu.h | 6 +
arch/x86/include/asm/mmu_context.h | 14 ++
arch/x86/include/asm/tlbflush.h | 73 ++++++
arch/x86/mm/tlb.c | 344 ++++++++++++++++++++++++++++-
4 files changed, 425 insertions(+), 12 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -67,6 +67,12 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ u16 global_asid;
+ bool asid_transition;
+#endif
+
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void destroy_context_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
@@ -160,6 +162,14 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+#endif
+
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -169,6 +179,10 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ destroy_context_free_global_asid(mm);
+#endif
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -238,6 +239,78 @@ void flush_tlb_one_kernel(unsigned long
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+static inline bool is_dyn_asid(u16 asid)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return true;
+
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
+static inline bool in_asid_transition(const struct flush_tlb_info *info)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return info->mm && READ_ONCE(info->mm->context.asid_transition);
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+#else
+static inline bool is_dyn_asid(u16 asid)
+{
+ return true;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return false;
+}
+
+static inline bool in_asid_transition(const struct flush_tlb_info *info)
+{
+ return false;
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ return false;
+}
+
+static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ VM_WARN_ON_ONCE(1);
+}
+
+static inline void consider_global_asid(struct mm_struct *mm)
+{
+}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with broadcast TLB
+ * flushing. The TLB is never outdated, and does not need flushing.
+ */
+ if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st
*need_flush = true;
}
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+/*
+ * Logic for broadcast TLB invalidation.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 };
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 };
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ /*
+ * A global TLB flush guarantees that any stale entries from
+ * previously freed global ASIDs get flushed from the TLB
+ * everywhere, making these global ASIDs safe to reuse.
+ */
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * Clear all the previously freed global ASIDs from the
+ * global_asid_used bitmap, now that the global TLB flush
+ * has made them actually available for re-use.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /*
+ * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID
+ * assignments, for tasks doing IPI based TLB shootdowns.
+ * Restart the search from the start of the global ASID space.
+ */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 get_global_asid(void)
+{
+
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previously allocated ASID is at the top of the ASID space. */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Returns true if the mm is transitioning from a CPU-local ASID to a global
+ * (INVLPGB) ASID, or the other way around.
+ */
+static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid && prev_asid != global_asid)
+ return true;
+
+ if (!global_asid && is_global_asid(prev_asid))
+ return true;
+
+ return false;
+}
+
+void destroy_context_free_global_asid(struct mm_struct *mm)
+{
+ if (!mm->context.global_asid)
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+}
+
+/*
+ * Check whether a process is currently active on more than "threshold" CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm->context.global_asid)
+ return;
+
+ /* The last global ASID was consumed while waiting for the lock. */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = get_global_asid();
+ if (!asid)
+ return;
+
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+
+static bool meets_global_asid_threshold(struct mm_struct *mm)
+{
+ if (!global_asid_available)
+ return false;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ return mm_active_cpus_exceeds(mm, 3);
+}
+
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!static_cpu_has(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ if (meets_global_asid_threshold(mm))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!READ_ONCE(mm->context.asid_transition))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long maxnr = invlpgb_count_max;
+ unsigned long asid = info->mm->context.global_asid;
+ unsigned long addr = info->start;
+ unsigned long nr;
+
+ /* Flushing multiple pages at once is not supported with 1GB pages. */
+ if (info->stride_shift > PMD_SHIFT)
+ maxnr = 1;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ /*
+ * Calculate how many pages can be flushed at once; if the
+ * remainder of the range is less than one page, flush one.
+ */
+ nr = min(maxnr, (info->end - addr) >> info->stride_shift);
+ nr = max(nr, 1);
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ tlbsync();
+}
+#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */
+
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
* If this races with another thread that enables lam, 'new_lam'
@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
+ * Check if the current mm is transitioning to a new ASID.
+ */
+ if (needs_global_asid_reload(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this PCID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
+ /*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that we're changing CR3.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -926,7 +1243,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || in_asid_transition(info))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
@@ -1021,8 +1338,11 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
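For a sense of scale (editor's assumption based on the usual x86 constants, MAX_ASID_AVAILABLE = 2048 and TLB_NR_DYN_ASIDS = 6): roughly 2041 global ASIDs are available, the first TLB_NR_DYN_ASIDS values stay reserved for per-CPU dynamic assignment, and reset_global_asid_space() only recycles freed global ASIDs after an invlpgb_flush_all_nonglobals() once the allocator reaches the top of that range.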


@@ -0,0 +1,135 @@
From 647727eaa06fc61fbc55de4c09ab0c0fe7bc7263 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:29 -0500
Subject: x86/mm: do targeted broadcast flushing from tlbbatch code
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/tlbbatch.h | 1 +
arch/x86/include/asm/tlbflush.h | 12 ++-----
arch/x86/mm/tlb.c | 57 +++++++++++++++++++++++++++++++--
3 files changed, 58 insertions(+), 12 deletions(-)
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ bool used_invlpgb;
};
#endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -358,21 +358,15 @@ static inline u64 inc_mm_tlb_gen(struct
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1612,9 +1612,7 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1623,12 +1621,65 @@ void arch_tlbbatch_flush(struct arch_tlb
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+ tlbsync();
+ migrate_enable();
+ batch->used_invlpgb = false;
+ }
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ u16 asid = mm_global_asid(mm);
+
+ if (asid) {
+ /*
+ * Queue up an asynchronous invalidation. The corresponding
+ * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+ * on the same CPU.
+ */
+ if (!batch->used_invlpgb) {
+ batch->used_invlpgb = true;
+ migrate_disable();
+ }
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (READ_ONCE(mm->context.asid_transition))
+ asid = 0;
+ }
+
+ if (!asid) {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or


@@ -0,0 +1,79 @@
From 0678da9f0870f0d211d49808a66e98abc0c58438 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:30 -0500
Subject: x86/mm: enable AMD translation cache extensions
With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.
This can help reduce the TLB miss rate, by keeping more intermediate
mappings in the cache.
From the AMD manual:
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
TLB entries. When this bit is 0, these instructions remove the target PTE
from the TLB as well as all upper-level table entries that are cached
in the TLB, whether or not they are associated with the target PTE.
When this bit is set, these instructions will remove the target PTE and
only those upper-level entries that lead to the target PTE in
the page table hierarchy, leaving unrelated upper-level entries intact.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/msr-index.h | 2 ++
arch/x86/kernel/cpu/amd.c | 4 ++++
tools/arch/x86/include/asm/msr-index.h | 2 ++
3 files changed, 8 insertions(+)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_feature_enabled(X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -25,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
@@ -34,6 +35,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_TCE (1<<_EFER_TCE)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/*
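Whether the bit took effect can be checked from userspace with the msr-tools package (assuming it is installed and the msr module provides /dev/cpu/*/msr); reading EFER (MSR 0xC0000080) bit 15 should print 1 on CPUs where TCE was enabled:

	rdmsr -f 15:15 0xc0000080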


@@ -0,0 +1,66 @@
From 02d1759eda082f9595f3232f5dffd5d49943924a Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 22 Jan 2025 23:23:31 -0500
Subject: x86/mm: only invalidate final translations with INVLPGB
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.
On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.
Signed-off-by: Rik van Riel <riel@surriel.com>
---
arch/x86/include/asm/invlpgb.h | 10 ++++++++--
arch/x86/mm/tlb.c | 8 ++++----
2 files changed, 12 insertions(+), 6 deletions(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -67,9 +67,15 @@ static inline void invlpgb_flush_user(un
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr,
- bool pmd_stride)
+ bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+ unsigned long flags = INVLPGB_PCID | INVLPGB_VA;
+
+ if (!freed_tables)
+ flags |= INVLPGB_FINAL_ONLY;
+
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags);
}
/* Flush all mappings for a given PCID, not including globals. */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -518,10 +518,10 @@ static void broadcast_tlb_flush(struct f
nr = min(maxnr, (info->end - addr) >> info->stride_shift);
nr = max(nr, 1);
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables);
addr += nr << info->stride_shift;
} while (addr < info->end);
@@ -1654,10 +1654,10 @@ void arch_tlbbatch_add_pending(struct ar
batch->used_invlpgb = true;
migrate_disable();
}
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false);
/*
* Some CPUs might still be using a local ASID for this


@@ -0,0 +1,94 @@
From b61dfc43cfc7511795366dfd9260f0959ca2f2d2 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 19 Dec 2024 15:32:53 -0500
Subject: mm: remove unnecessary calls to lru_add_drain
There seem to be several categories of calls to lru_add_drain
and lru_add_drain_all.
The first are code paths that recently allocated, swapped in,
or otherwise processed a batch of pages, and want them all on
the LRU. These drain pages that were recently allocated,
probably on the local CPU.
A second category are code paths that are actively trying to
reclaim, migrate, or offline memory. These often use lru_add_drain_all,
to drain the caches on all CPUs.
However, there also seem to be some other callers where we
aren't really doing either. They are calling lru_add_drain(),
despite operating on pages that may have been allocated
long ago, and quite possibly on different CPUs.
Those calls are not likely to be effective at anything but
creating lock contention on the LRU locks.
Remove the lru_add_drain calls in the latter category.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
---
mm/memory.c | 1 -
mm/mmap.c | 2 --
mm/swap_state.c | 1 -
mm/vma.c | 2 --
4 files changed, 6 deletions(-)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
vma, new_start, length, false, true))
return -ENOMEM;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,


@@ -0,0 +1,429 @@
From e2d1ffb13e3909dab142f0f8ec8f934b79930717 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Mon, 14 Oct 2024 16:13:39 +0100
Subject: vdso: Introduce vdso/page.h
The VDSO implementation includes headers from outside of the
vdso/ namespace.
Introduce vdso/page.h to make sure that the generic library
uses only the allowed namespace.
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com
---
arch/alpha/include/asm/page.h | 6 +-----
arch/arc/include/uapi/asm/page.h | 7 +++----
arch/arm/include/asm/page.h | 5 +----
arch/arm64/include/asm/page-def.h | 5 +----
arch/csky/include/asm/page.h | 8 ++------
arch/hexagon/include/asm/page.h | 4 +---
arch/loongarch/include/asm/page.h | 7 +------
arch/m68k/include/asm/page.h | 6 ++----
arch/microblaze/include/asm/page.h | 5 +----
arch/mips/include/asm/page.h | 7 +------
arch/nios2/include/asm/page.h | 7 +------
arch/openrisc/include/asm/page.h | 11 +----------
arch/parisc/include/asm/page.h | 4 +---
arch/powerpc/include/asm/page.h | 10 +---------
arch/riscv/include/asm/page.h | 4 +---
arch/s390/include/asm/page.h | 13 +++++--------
arch/sh/include/asm/page.h | 6 ++----
arch/sparc/include/asm/page_32.h | 4 +---
arch/sparc/include/asm/page_64.h | 4 +---
arch/um/include/asm/page.h | 5 +----
arch/x86/include/asm/page_types.h | 5 +----
arch/xtensa/include/asm/page.h | 8 +-------
include/vdso/page.h | 30 ++++++++++++++++++++++++++++++
23 files changed, 61 insertions(+), 110 deletions(-)
create mode 100644 include/vdso/page.h
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -4,11 +4,7 @@
#include <linux/const.h>
#include <asm/pal.h>
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arc/include/uapi/asm/page.h
+++ b/arch/arc/include/uapi/asm/page.h
@@ -14,7 +14,7 @@
/* PAGE_SHIFT determines the page size */
#ifdef __KERNEL__
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+#include <vdso/page.h>
#else
/*
* Default 8k
@@ -24,11 +24,10 @@
* not available
*/
#define PAGE_SHIFT 13
+#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
+#define PAGE_MASK (~(PAGE_SIZE-1))
#endif
-#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
#define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwrds */
-#define PAGE_MASK (~(PAGE_SIZE-1))
-
#endif /* _UAPI__ASM_ARC_PAGE_H */
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -7,10 +7,7 @@
#ifndef _ASMARM_PAGE_H
#define _ASMARM_PAGE_H
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arm64/include/asm/page-def.h
+++ b/arch/arm64/include/asm/page-def.h
@@ -10,9 +10,6 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#endif /* __ASM_PAGE_DEF_H */
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -7,12 +7,8 @@
#include <asm/cache.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size: 4KB
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
+
#define THREAD_SIZE (PAGE_SIZE * 2)
#define THREAD_MASK (~(THREAD_SIZE - 1))
#define THREAD_SHIFT (PAGE_SHIFT + 1)
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -45,9 +45,7 @@
#define HVM_HUGEPAGE_SIZE 0x5
#endif
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -8,12 +8,7 @@
#include <linux/const.h>
#include <asm/addrspace.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3)
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -6,10 +6,8 @@
#include <asm/setup.h>
#include <asm/page_offset.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PAGE_OFFSET (PAGE_OFFSET_RAW)
#ifndef __ASSEMBLY__
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -19,10 +19,7 @@
#ifdef __KERNEL__
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR))
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -14,12 +14,7 @@
#include <linux/kernel.h>
#include <asm/mipsregs.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
/*
* This is used for calculating the real page sizes
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -18,12 +18,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
/*
* PAGE_OFFSET -- the first address of the first page of memory.
--- a/arch/openrisc/include/asm/page.h
+++ b/arch/openrisc/include/asm/page.h
@@ -15,16 +15,7 @@
#ifndef __ASM_OPENRISC_PAGE_H
#define __ASM_OPENRISC_PAGE_H
-
-/* PAGE_SHIFT determines the page size */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define PAGE_OFFSET 0xc0000000
#define KERNELBASE PAGE_OFFSET
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -21,8 +21,7 @@
* page size. When using 64K pages however, whether we are really supporting
* 64K pages in HW or not is irrelevant to those definitions.
*/
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
#ifndef CONFIG_HUGETLB_PAGE
@@ -42,13 +41,6 @@ extern unsigned int hpage_shift;
#endif
/*
- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we
- * assign PAGE_MASK to a larger type it gets extended the way we want
- * (i.e. with 1s in the high bits)
- */
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
-
-/*
* KERNELBASE is the virtual address of the start of the kernel, it's often
* the same as PAGE_OFFSET, but _might not be_.
*
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -12,9 +12,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -11,14 +11,11 @@
#include <linux/const.h>
#include <asm/types.h>
-#define _PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT)
-#define _PAGE_MASK (~(_PAGE_SIZE - 1))
+#include <vdso/page.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT _PAGE_SHIFT
-#define PAGE_SIZE _PAGE_SIZE
-#define PAGE_MASK _PAGE_MASK
+#define _PAGE_SHIFT PAGE_SHIFT
+#define _PAGE_SIZE PAGE_SIZE
+#define _PAGE_MASK PAGE_MASK
#define PAGE_DEFAULT_ACC _AC(0, UL)
/* storage-protection override */
#define PAGE_SPO_ACC 9
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -8,10 +8,8 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PTE_MASK PAGE_MASK
#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -11,9 +11,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
/* Flushing for D-cache alias handling is only needed if
* the page size is smaller than 16K.
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -9,10 +9,7 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -6,10 +6,7 @@
#include <linux/types.h>
#include <linux/mem_encrypt.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -18,13 +18,7 @@
#include <asm/cache.h>
#include <asm/kmem_layout.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifdef CONFIG_MMU
#define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR
--- /dev/null
+++ b/include/vdso/page.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_PAGE_H
+#define __VDSO_PAGE_H
+
+#include <uapi/linux/const.h>
+
+/*
+ * PAGE_SHIFT determines the page size.
+ *
+ * Note: This definition is required because PAGE_SHIFT is used
+ * in several places throughout the codebase.
+ */
+#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+
+#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
+
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+/*
+ * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ *
+ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
+ * So if we assign PAGE_MASK to a larger type it gets extended the
+ * way we want (i.e. with 1s in the high bits)
+ */
+#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
+#else
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#endif /* __VDSO_PAGE_H */
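
In essence, the new header derives PAGE_SHIFT, PAGE_SIZE and PAGE_MASK from the Kconfig page shift in one shared place. A brief standalone sketch of how the resulting macros behave (not kernel code: CONFIG_PAGE_SHIFT is hard-coded to 12 purely for illustration, and the simple unsigned PAGE_MASK variant is used):

/*
 * Standalone illustration of the macros above.  CONFIG_PAGE_SHIFT is
 * normally supplied by Kconfig; it is hard-coded here.  Not kernel code.
 */
#include <stdio.h>

#define CONFIG_PAGE_SHIFT 12
#define PAGE_SHIFT CONFIG_PAGE_SHIFT
#define PAGE_SIZE  (1UL << PAGE_SHIFT)	/* _AC(1,UL) reduces to 1UL outside asm */
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x12345678UL;

	printf("PAGE_SIZE  = %lu\n", PAGE_SIZE);		/* 4096 */
	printf("round down = %#lx\n", addr & PAGE_MASK);	/* 0x12345000 */
	printf("round up   = %#lx\n", (addr + PAGE_SIZE - 1) & PAGE_MASK);	/* 0x12346000 */
	return 0;
}
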

View File

@ -0,0 +1,68 @@
From 4478ee194402472199e05d3e27a87f0fc775cc18 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 24 Oct 2024 13:34:26 +0000
Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures
With the introduction of an architecture-independent definition of
PAGE_MASK, we had to make a choice between defining it as 'unsigned long'
as on 64-bit architectures, or as signed 'long' as required for
architectures with a 64-bit phys_addr_t.
To reduce the risk for regressions and minimize the changes in behavior,
the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT
is set, but that ended up causing a regression after all in the
early_init_dt_add_memory_arch() function that uses 64-bit integers for
address calculation.
Presumably the same regression also affects mips32 and powerpc32 when
dealing with large amounts of memory on DT platforms: like arm32, they were
using the signed version unconditionally.
The two most sensible options for addressing the regression are either to
go back to an architecture specific definition, using a signed constant on
arm/powerpc/mips and unsigned on the others, or to use the same definition
everywhere.
Use the simpler of those two and change them all to the signed version, in
the hope that this does not cause a different type of bug. Most of the
other 32-bit architectures have no large physical address support and are
rarely used, so it seems more likely that using the same definition helps
than hurts here.
In particular, x86-32 does have physical addressing extensions, so it
already changed to the signed version after the previous patch, so it makes
sense to use the same version on non-PAE as well.
Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org
Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/
---
include/vdso/page.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/include/vdso/page.h
+++ b/include/vdso/page.h
@@ -14,13 +14,14 @@
#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+#if !defined(CONFIG_64BIT)
/*
- * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ * Applies only to 32-bit architectures.
*
* Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
* So if we assign PAGE_MASK to a larger type it gets extended the
- * way we want (i.e. with 1s in the high bits)
+ * way we want (i.e. with 1s in the high bits) while masking a
+ * 64-bit value such as phys_addr_t.
*/
#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
#else
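
The sign-extension behaviour described above can be checked with a short standalone program, where uint32_t stands in for a 32-bit architecture's unsigned long (a sketch for illustration, not part of the patch):

/*
 * Illustrates why the shared PAGE_MASK stays signed on 32-bit: a
 * 32-bit unsigned mask zero-extends when combined with a 64-bit
 * phys_addr_t and clears the high bits, while the signed int mask
 * sign-extends and preserves them.  Not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t phys = 0x1234567890ULL;	/* above 4 GiB, needs a 64-bit phys_addr_t */
	uint32_t unsigned_mask = ~((1U << PAGE_SHIFT) - 1);	/* 0xfffff000 */
	int32_t  signed_mask   = ~((1  << PAGE_SHIFT) - 1);	/* -4096, same bit pattern */

	/* zero-extended mask drops the high bits: prints 0x34567000 */
	printf("unsigned mask: %#llx\n", (unsigned long long)(phys & unsigned_mask));
	/* sign-extended mask keeps them: prints 0x1234567000 */
	printf("signed mask:   %#llx\n", (unsigned long long)(phys & signed_mask));
	return 0;
}
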

View File

@ -102,7 +102,7 @@ Contains:
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6353,7 +6353,7 @@ retry:
@@ -6384,7 +6384,7 @@ retry:
return 0;
}
@ -111,7 +111,7 @@ Contains:
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
@@ -6382,6 +6382,10 @@ static bool allow_direct_reclaim(pg_data
@@ -6413,6 +6413,10 @@ static bool allow_direct_reclaim(pg_data
wmark_ok = free_pages > pfmemalloc_reserve / 2;
@ -122,7 +122,7 @@ Contains:
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
@@ -6447,7 +6451,7 @@ static bool throttle_direct_reclaim(gfp_
@@ -6478,7 +6482,7 @@ static bool throttle_direct_reclaim(gfp_
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
@ -131,7 +131,7 @@ Contains:
goto out;
break;
}
@@ -6469,11 +6473,14 @@ static bool throttle_direct_reclaim(gfp_
@@ -6500,11 +6504,14 @@ static bool throttle_direct_reclaim(gfp_
*/
if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
@ -148,7 +148,7 @@ Contains:
if (fatal_signal_pending(current))
return true;
@@ -6976,14 +6983,14 @@ restart:
@@ -7007,14 +7014,14 @@ restart:
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&

View File

@ -0,0 +1,194 @@
From eacae6d88bcc8a925124f97b7788bb2bfac8b267 Mon Sep 17 00:00:00 2001
From: Vinay Banakar <vny@google.com>
Date: Mon, 20 Jan 2025 16:47:29 -0600
Subject: mm: Optimize TLB flushes during page reclaim
The current implementation in shrink_folio_list() performs full TLB
flushes and issues IPIs for each individual page being reclaimed. This
causes unnecessary overhead during memory reclaim, whether triggered
by madvise(MADV_PAGEOUT) or kswapd, especially in scenarios where
applications are actively moving cold pages to swap while maintaining
high performance requirements for hot pages.
The current code:
1. Clears PTE and unmaps each page individually
2. Performs a full TLB flush on all cores using the VMA (via CR3 write) or
issues individual TLB shootdowns (invlpg+invpcid) for single-core usage
3. Submits each page individually to BIO
This approach results in:
- Excessive full TLB flushes across all cores
- Unnecessary IPI storms when processing multiple pages
- Suboptimal I/O submission patterns
I initially tried using selective TLB shootdowns (invlpg) instead of
full TLB flushes per each page to avoid interference with other
threads. However, this approach still required sending IPIs to all
cores for each page, which did not significantly improve application
throughput.
This patch instead optimizes the process by batching operations,
issuing one IPI per PMD instead of per page. This reduces interrupts
by a factor of 512 and enables batching page submissions to BIO. The
new approach:
1. Collect dirty pages that need to be written back
2. Issue a single TLB flush for all dirty pages in the batch
3. Process the collected pages for writebacks (submit to BIO)
Testing shows significant reduction in application throughput impact
during page-out operations. Applications maintain better performance
during memory reclaim, when triggered by explicit
madvise(MADV_PAGEOUT) calls.
I'd appreciate your feedback on this approach, especially on the
correctness of batched BIO submissions. Looking forward to your
comments.
Signed-off-by: Vinay Banakar <vny@google.com>
---
mm/vmscan.c | 120 ++++++++++++++++++++++++++++++++--------------------
1 file changed, 74 insertions(+), 46 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1053,6 +1053,7 @@ static unsigned int shrink_folio_list(st
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
+ LIST_HEAD(pageout_list);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1365,52 +1366,9 @@ retry:
if (!sc->may_writepage)
goto keep_locked;
- /*
- * Folio is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after I/O
- * starts and then write it out here.
- */
- try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug, folio_list)) {
- case PAGE_KEEP:
- goto keep_locked;
- case PAGE_ACTIVATE:
- /*
- * If shmem folio is split when writeback to swap,
- * the tail pages will make their own pass through
- * this function and be accounted then.
- */
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- goto activate_locked;
- case PAGE_SUCCESS:
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- stat->nr_pageout += nr_pages;
-
- if (folio_test_writeback(folio))
- goto keep;
- if (folio_test_dirty(folio))
- goto keep;
-
- /*
- * A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the folio.
- */
- if (!folio_trylock(folio))
- goto keep;
- if (folio_test_dirty(folio) ||
- folio_test_writeback(folio))
- goto keep_locked;
- mapping = folio_mapping(folio);
- fallthrough;
- case PAGE_CLEAN:
- ; /* try to free the folio below */
- }
+ /* Add to pageout list for deferred bio submissions */
+ list_add(&folio->lru, &pageout_list);
+ continue;
}
/*
@@ -1521,6 +1479,76 @@ keep:
}
/* 'folio_list' is always empty here */
+ if (!list_empty(&pageout_list)) {
+ /*
+ * Batch TLB flushes by flushing once before processing all dirty pages.
+ * Since we operate on one PMD at a time, this batches TLB flushes at
+ * PMD granularity rather than per-page, reducing IPIs.
+ */
+ struct address_space *mapping;
+ try_to_unmap_flush_dirty();
+
+ while (!list_empty(&pageout_list)) {
+ struct folio *folio = lru_to_folio(&pageout_list);
+ list_del(&folio->lru);
+
+ /* Recheck if page got reactivated */
+ if (folio_test_active(folio) ||
+ (folio_mapped(folio) && folio_test_young(folio)))
+ goto skip_pageout_locked;
+
+ mapping = folio_mapping(folio);
+ pageout_t pageout_res = pageout(folio, mapping, &plug);
+ switch (pageout_res) {
+ case PAGE_KEEP:
+ goto skip_pageout_locked;
+ case PAGE_ACTIVATE:
+ goto skip_pageout_locked;
+ case PAGE_SUCCESS:
+ stat->nr_pageout += folio_nr_pages(folio);
+
+ if (folio_test_writeback(folio) ||
+ folio_test_dirty(folio))
+ goto skip_pageout;
+
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the folio.
+ */
+ if (!folio_trylock(folio))
+ goto skip_pageout;
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
+ goto skip_pageout_locked;
+
+ // Try to free the page
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+
+ case PAGE_CLEAN:
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+ }
+
+skip_pageout_locked:
+ folio_unlock(folio);
+skip_pageout:
+ list_add(&folio->lru, &ret_folios);
+ }
+ }
+
/* Migrate folios selected for demotion */
stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_reclaimed += stat->nr_demoted;
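
Taken together, the patch replaces a flush-per-folio pattern with one flush per collected batch. A minimal standalone sketch of that control flow, using flush_tlb() and writeback() as illustrative stand-ins rather than kernel functions, shows where the factor-of-512 reduction in IPIs comes from:

/*
 * Control-flow sketch of the batching idea: one TLB flush per batch of
 * dirty folios instead of one per folio.  flush_tlb() and writeback()
 * are illustrative stand-ins, not kernel APIs.
 */
#include <stdio.h>

#define NR_DIRTY 512	/* one PMD's worth of 4K pages */

static int flushes;

static void flush_tlb(void)	{ flushes++; }	/* stands in for the IPI/flush */
static void writeback(int page)	{ (void)page; }	/* stands in for pageout()/BIO submit */

int main(void)
{
	/* Old scheme: flush before writing back every single dirty page. */
	flushes = 0;
	for (int page = 0; page < NR_DIRTY; page++) {
		flush_tlb();
		writeback(page);
	}
	printf("per-page flushing: %d flushes\n", flushes);	/* 512 */

	/* New scheme: collect the batch first, flush once, then write back. */
	flushes = 0;
	flush_tlb();
	for (int page = 0; page < NR_DIRTY; page++)
		writeback(page);
	printf("batched flushing:  %d flushes\n", flushes);	/* 1 */

	return 0;
}
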

19
debian/patches/series vendored
View File

@ -47,7 +47,6 @@ features/x86/intel-iommu-add-option-to-exclude-integrated-gpu-only.patch
features/x86/intel-iommu-add-kconfig-option-to-exclude-igpu-by-default.patch
# Disable autoloading/probing of various drivers by default
debian/cdc_ncm-cdc_mbim-use-ncm-by-default.patch
debian/snd-pcsp-disable-autoload.patch
bugfix/x86/viafb-autoload-on-olpc-xo1.5-only.patch
debian/fjes-disable-autoload.patch
@ -203,6 +202,24 @@ patchset-xanmod/valve/0004-leds-steamdeck-Add-support-for-Steam-Deck-LED.patch
patchset-xanmod/valve/0005-mfd-Add-MFD-core-driver-for-Steam-Deck.patch
patchset-xanmod/valve/0006-mfd-steamdeck-Expose-controller-board-power-in-sysfs.patch
patchset-zen/nvlpgb-v7/0001-x86-mm-make-MMU_GATHER_RCU_TABLE_FREE-unconditional.patch
patchset-zen/nvlpgb-v7/0002-x86-mm-remove-pv_ops.mmu.tlb_remove_table-call.patch
patchset-zen/nvlpgb-v7/0003-x86-mm-consolidate-full-flush-threshold-decision.patch
patchset-zen/nvlpgb-v7/0004-x86-mm-get-INVLPGB-count-max-from-CPUID.patch
patchset-zen/nvlpgb-v7/0005-x86-mm-add-INVLPGB-support-code.patch
patchset-zen/nvlpgb-v7/0006-x86-mm-use-INVLPGB-for-kernel-TLB-flushes.patch
patchset-zen/nvlpgb-v7/0007-x86-mm-use-INVLPGB-in-flush_tlb_all.patch
patchset-zen/nvlpgb-v7/0008-x86-mm-use-broadcast-TLB-flushing-for-page-reclaim-T.patch
patchset-zen/nvlpgb-v7/0009-x86-mm-enable-broadcast-TLB-invalidation-for-multi-t.patch
patchset-zen/nvlpgb-v7/0010-x86-mm-do-targeted-broadcast-flushing-from-tlbbatch-.patch
patchset-zen/nvlpgb-v7/0011-x86-mm-enable-AMD-translation-cache-extensions.patch
patchset-zen/nvlpgb-v7/0012-x86-mm-only-invalidate-final-translations-with-INVLP.patch
patchset-zen/nvlpgb-v7/0013-mm-remove-unnecessary-calls-to-lru_add_drain.patch
patchset-zen/nvlpgb-v7/0014-vdso-Introduce-vdso-page.h.patch
patchset-zen/nvlpgb-v7/0015-vdso-Change-PAGE_MASK-to-signed-on-all-32-bit-archit.patch
patchset-zen/tlb/0001-mm-Optimize-TLB-flushes-during-page-reclaim.patch
patchset-xanmod/xanmod/0001-kbuild-Remove-GCC-minimal-function-alignment.patch
patchset-xanmod/xanmod/0002-XANMOD-fair-Set-scheduler-tunable-latencies-to-unsca.patch
patchset-xanmod/xanmod/0003-XANMOD-sched-Add-yield_type-sysctl-to-reduce-or-disa.patch