
refresh patches

commit b65c570ac2
parent 3d597650a9
Date: 2025-03-27 01:51:30 +03:00
239 changed files with 14214 additions and 9267 deletions


@@ -0,0 +1,33 @@
From 6dada600ab3579296c9b2b57cf41b95792f021ed Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Sat, 13 Jan 2024 15:29:25 +0100
Subject: arch/Kconfig: Default to maximum amount of ASLR bits
To mitigate CVE-2024-26621 and improve randomization quality further. Do
this with a patch to avoid having to enable `CONFIG_EXPERT`.
Cherry-picked-for: https://zolutal.github.io/aslrnt/
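A minimal userspace sketch (not part of the patch) makes the effect
observable: run it a few times and compare how widely the addresses
scatter. With this change the default for vm.mmap_rnd_bits moves from
the architecture minimum to the maximum (28 to 32 bits on x86-64).

    /* aslr-peek.c: print the address of a fresh anonymous mapping.
     * Build: cc -O2 -o aslr-peek aslr-peek.c
     * With more mmap randomization bits, repeated runs spread the
     * printed addresses over a much wider range. */
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        printf("mmap base: %p\n", p);
        munmap(p, 4096);
        return 0;
    }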
---
arch/Kconfig | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1137,7 +1137,7 @@ config ARCH_MMAP_RND_BITS
int "Number of bits to use for ASLR of mmap base address" if EXPERT
range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
- default ARCH_MMAP_RND_BITS_MIN
+ default ARCH_MMAP_RND_BITS_MAX
depends on HAVE_ARCH_MMAP_RND_BITS
help
This value can be used to select the number of bits to use to
@@ -1171,7 +1171,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
- default ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default ARCH_MMAP_RND_COMPAT_BITS_MAX
depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
help
This value can be used to select the number of bits to use to


@@ -1,162 +0,0 @@
From 3c32c0d457a2c4b2817f57e1e2c9cbba4624639e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 22 Nov 2024 11:33:05 -0800
Subject: futex: improve user space accesses
Josh Poimboeuf reports that he got a "will-it-scale.per_process_ops 1.9%
improvement" report for his patch that changed __get_user() to use
pointer masking instead of the explicit speculation barrier. However,
that patch doesn't actually work in the general case, because some (very
bad) architecture-specific code actually depends on __get_user() also
working on kernel addresses.
A profile showed that the offending __get_user() was the futex code,
which really should be fixed up to not use that horrid legacy case.
Rewrite futex_get_value_locked() to use the modern user access helpers,
and inline it so that the compiler not only avoids the function call for
a few instructions, but can do CSE on the address masking.
It also turns out the x86 futex functions have unnecessary barriers in
other places, so let's fix those up too.
Link: https://lore.kernel.org/all/20241115230653.hfvzyf3aqqntgp63@jpoimboe/
Reported-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
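For illustration, a simplified userspace sketch of the pointer-masking
idea; the 47-bit user address space limit is an assumption here, and
the kernel's masked_user_access_begin() does the clamp branchlessly
per architecture:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumption: 47-bit user address space (typical x86-64). */
    #define USER_PTR_MAX ((uintptr_t)1 << 47)

    /*
     * Clamp out-of-range pointers to a guaranteed-faulting address
     * instead of executing a speculation barrier: even a
     * mis-speculated dereference can then only touch user space.
     */
    static const void *mask_user_ptr(const void *ptr)
    {
        uintptr_t addr = (uintptr_t)ptr;

        if (addr > USER_PTR_MAX)
            addr = USER_PTR_MAX;
        return (const void *)addr;
    }

    int main(void)
    {
        int x = 42;

        printf("valid:       %p -> %p\n", (void *)&x,
               (void *)mask_user_ptr(&x));
        printf("kernel-like: %p -> %p\n", (void *)-1L,
               (void *)mask_user_ptr((void *)-1L));
        return 0;
    }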
---
arch/x86/include/asm/futex.h | 8 +++--
kernel/futex/core.c | 22 --------------
kernel/futex/futex.h | 59 ++++++++++++++++++++++++++++++++++--
3 files changed, 63 insertions(+), 26 deletions(-)
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -48,7 +48,9 @@ do { \
static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
u32 __user *uaddr)
{
- if (!user_access_begin(uaddr, sizeof(u32)))
+ if (can_do_masked_user_access())
+ uaddr = masked_user_access_begin(uaddr);
+ else if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
switch (op) {
@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_i
{
int ret = 0;
- if (!user_access_begin(uaddr, sizeof(u32)))
+ if (can_do_masked_user_access())
+ uaddr = masked_user_access_begin(uaddr);
+ else if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
asm volatile("\n"
"1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct
return NULL;
}
-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-int futex_get_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
/**
* wait_for_owner_exiting - Block until the owner has exited
* @ret: owner's current futex lock status
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -6,6 +6,7 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
#include <linux/compat.h>
+#include <linux/uaccess.h>
#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
@@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct fut
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
-extern int futex_get_value_locked(u32 *dest, u32 __user *from);
extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+/*
+ * This does a plain atomic user space read, and the user pointer has
+ * already been verified earlier by get_futex_key() to be both aligned
+ * and actually in user space, just like futex_atomic_cmpxchg_inatomic().
+ *
+ * We still want to avoid any speculation, and while __get_user() is
+ * the traditional model for this, it's actually slower than doing
+ * this manually these days.
+ *
+ * We could just have a per-architecture special function for it,
+ * the same way we do futex_atomic_cmpxchg_inatomic(), but rather
+ * than force everybody to do that, write it out long-hand using
+ * the low-level user-access infrastructure.
+ *
+ * This looks a bit overkill, but generally just results in a couple
+ * of instructions.
+ */
+static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
+{
+ u32 val;
+
+ if (can_do_masked_user_access())
+ from = masked_user_access_begin(from);
+ else if (!user_read_access_begin(from, sizeof(*from)))
+ return -EFAULT;
+ unsafe_get_user(val, from, Efault);
+ user_access_end();
+ *dest = val;
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_read_inatomic(dest, from);
+ pagefault_enable();
+
+ return ret;
+}
+
extern void __futex_unqueue(struct futex_q *q);
extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
struct task_struct *task);


@@ -0,0 +1,85 @@
From 5ac90c5aed97728c8f4f64c02d75334c84a801ef Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javierm@redhat.com>
Date: Thu, 19 May 2022 14:40:07 +0200
Subject: drivers/firmware: skip simpledrm if nvidia-drm.modeset=1 is set
The Nvidia proprietary driver has some bugs that lead to issues if used
with the simpledrm driver. The most noticeable is that it does not
register an emulated fbdev device.
It just relies on a fbdev being registered by another driver, one that
could be attached to the framebuffer console. On UEFI machines, this is
the efifb driver.
This means that disabling the efifb driver will cause virtual consoles to
not be present in the system when using the Nvidia driver. Legacy BIOS is
not affected, simply because fbcon is not used there but vgacon instead,
unless a VGA mode is specified using the vga= kernel command line option,
in which case the vesafb driver is used and its fbdev is attached to the
fbcon.
This is a problem because with CONFIG_SYSFB_SIMPLEFB=y, the sysfb platform
code attempts to register a "simple-framebuffer" platform device (that is
matched against simpledrm) and only registers either an "efi-framebuffer"
or "vesa-framebuffer" if this fails to be registered due the video modes
not being compatible.
The Nvidia driver relying on another driver to register the fbdev is quite
fragile, since it can't really assume those will stick around. For example
there are patches posted to remove the EFI and VESA platform devices once
a real DRM or fbdev driver probes.
But in any case, moving to a simpledrm + emulated fbdev only breaks this
assumption and causes users to not have VT if the Nvidia driver is used.
So to prevent this, let's add a workaround and make sysfb skip the
"simple-framebuffer" registration when the nvidia-drm.modeset=1 option is set.
This is quite horrible, but honestly I can't think of any other approach.
For this to work, the CONFIG_FB_EFI and CONFIG_FB_VESA config options must
be enabled besides CONFIG_DRM_SIMPLEDRM.
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
Source: https://gitlab.com/cki-project/kernel-ark/-/merge_requests/1788
Cherry-picked-for: https://bugs.archlinux.org/task/73720
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/94
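For reference, the handler's parsing can be approximated in plain C;
get_option() behaves like strtol() for this simple integer case (a
userspace sketch, not the driver code):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const char *opt = "1";  /* value from nvidia-drm.modeset=1 */
        int skip_simpledrm = (int)strtol(opt, NULL, 0);

        if (skip_simpledrm)
            printf("The simpledrm driver will not be probed\n");
        return 0;
    }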
---
drivers/firmware/sysfb.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -35,6 +35,22 @@
#include <linux/screen_info.h>
#include <linux/sysfb.h>
+static int skip_simpledrm;
+
+static int __init simpledrm_disable(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+
+ get_option(&opt, &skip_simpledrm);
+
+ if (skip_simpledrm)
+ pr_info("The simpledrm driver will not be probed\n");
+
+ return 0;
+}
+early_param("nvidia-drm.modeset", simpledrm_disable);
+
static struct platform_device *pd;
static DEFINE_MUTEX(disable_lock);
static bool disabled;
@@ -164,7 +180,7 @@ static __init int sysfb_init(void)
/* try to create a simple-framebuffer device */
compatible = sysfb_parse_mode(si, &mode);
- if (compatible) {
+ if (compatible && !skip_simpledrm) {
pd = sysfb_create_simplefb(si, &mode, parent);
if (!IS_ERR(pd))
goto put_device;


@@ -0,0 +1,56 @@
From 69907adec3041a6a89d192441a61481d80ee5806 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Wed, 12 Feb 2025 16:33:54 +0800
Subject: EDAC/igen6: Fix the flood of invalid error reports
The ECC_ERROR_LOG register of certain SoCs may contain the invalid value
~0, which results in a flood of invalid error reports in polling mode.
Fix the flood of invalid error reports by skipping the invalid ECC error
log value ~0.
Fixes: e14232afa944 ("EDAC/igen6: Add polling support")
Reported-by: Ramses <ramses@well-founded.dev>
Closes: https://lore.kernel.org/all/OISL8Rv--F-9@well-founded.dev/
Tested-by: Ramses <ramses@well-founded.dev>
Reported-by: John <therealgraysky@proton.me>
Closes: https://lore.kernel.org/all/p5YcxOE6M3Ncxpn2-Ia_wCt61EM4LwIiN3LroQvT_-G2jMrFDSOW5k2A9D8UUzD2toGpQBN1eI0sL5dSKnkO8iteZegLoQEj-DwQaMhGx4A=@proton.me/
Tested-by: John <therealgraysky@proton.me>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20250212083354.31919-1-qiuxu.zhuo@intel.com
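The order of the checks matters: ~0 has both the CE and the UE bit
set, so the all-ones test must run before the CE/UE test. A standalone
sketch with stand-in bit positions (the real layout lives in
igen6_edac.c):

    #include <assert.h>
    #include <stdint.h>

    #define ECC_ERROR_LOG_CE (1ULL << 62)   /* stand-in position */
    #define ECC_ERROR_LOG_UE (1ULL << 63)   /* stand-in position */

    static uint64_t filter_ecclog(uint64_t ecclog)
    {
        if (ecclog == ~0ULL)    /* invalid value: skip it */
            return 0;
        if (!(ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)))
            return 0;           /* neither a CE nor a UE */
        return ecclog;          /* caller clears and reports it */
    }

    int main(void)
    {
        assert(filter_ecclog(~0ULL) == 0);              /* quirk */
        assert(filter_ecclog(0) == 0);                  /* no error */
        assert(filter_ecclog(ECC_ERROR_LOG_CE | 0x42) != 0);
        return 0;
    }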
---
drivers/edac/igen6_edac.c | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -785,13 +785,22 @@ static u64 ecclog_read_and_clear(struct
{
u64 ecclog = readq(imc->window + ECC_ERROR_LOG_OFFSET);
- if (ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)) {
- /* Clear CE/UE bits by writing 1s */
- writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET);
- return ecclog;
- }
+ /*
+ * Quirk: The ECC_ERROR_LOG register of certain SoCs may contain
+ * the invalid value ~0. This will result in a flood of invalid
+ * error reports in polling mode. Skip it.
+ */
+ if (ecclog == ~0)
+ return 0;
- return 0;
+ /* Neither a CE nor a UE. Skip it. */
+ if (!(ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)))
+ return 0;
+
+ /* Clear CE/UE bits by writing 1s */
+ writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET);
+
+ return ecclog;
}
static void errsts_clear(struct igen6_imc *imc)


@@ -1,113 +0,0 @@
From 7ac6508c4db81eced5f6e3d7c8913af1da6cf110 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:22 -0500
Subject: x86/mm: consolidate full flush threshold decision
Reduce code duplication by consolidating the decision point
for whether to do individual invalidations or a full flush
inside get_flush_tlb_info.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
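In isolation, the two decisions that now live in get_flush_tlb_info()
reduce to the sketch below; 33 is the kernel's default
tlb_single_page_flush_ceiling, which stays tunable at runtime:

    #include <stdio.h>

    #define TLB_FLUSH_ALL (~0UL)

    static unsigned long tlb_single_page_flush_ceiling = 33;

    static void decide(unsigned long *start, unsigned long *end,
                       unsigned int stride_shift)
    {
        unsigned long stride = 1UL << stride_shift;

        /* Round to the stride so partial pages get fully flushed. */
        *start &= ~(stride - 1);                      /* round_down */
        *end = (*end + stride - 1) & ~(stride - 1);   /* round_up */

        /* Too many single-page flushes: do a full flush instead. */
        if ((*end - *start) >> stride_shift > tlb_single_page_flush_ceiling) {
            *start = 0;
            *end = TLB_FLUSH_ALL;
        }
    }

    int main(void)
    {
        unsigned long s = 0x1234, e = 0x5678;

        decide(&s, &e, 12);
        printf("flush [%#lx, %#lx)\n", s, e);
        return 0;
    }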
---
arch/x86/mm/tlb.c | 56 ++++++++++++++++++++++++++---------------------
1 file changed, 31 insertions(+), 25 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,8 +1000,13 @@ static struct flush_tlb_info *get_flush_
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
- info->start = start;
- info->end = end;
+ /*
+ * Round the start and end addresses to the page size specified
+ * by the stride shift. This ensures partial pages at the end of
+ * a range get fully invalidated.
+ */
+ info->start = round_down(start, 1 << stride_shift);
+ info->end = round_up(end, 1 << stride_shift);
info->mm = mm;
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
@@ -1009,6 +1014,19 @@ static struct flush_tlb_info *get_flush_
info->initiating_cpu = smp_processor_id();
info->trim_cpumask = 0;
+ WARN_ONCE(start != info->start || end != info->end,
+ "TLB flush not stride %x aligned. Start %lx, end %lx\n",
+ 1 << stride_shift, start, end);
+
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ info->start = 0;
+ info->end = TLB_FLUSH_ALL;
+ }
+
return info;
}
@@ -1026,17 +1044,8 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1098,19 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
+ struct flush_tlb_info *info;
+
+ guard(preempt)();
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+ if (info->end == TLB_FLUSH_ALL)
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
- put_flush_tlb_info();
- preempt_enable();
- }
+ put_flush_tlb_info();
}
/*
@@ -1276,7 +1282,7 @@ void arch_tlbbatch_flush(struct arch_tlb
int cpu = get_cpu();
- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
/*
* flush_tlb_multi() is not optimized for the common case in which only


@@ -1,90 +0,0 @@
From e772b2eb66e5c3cf668feadab678f2a88d896189 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:23 -0500
Subject: x86/mm: get INVLPGB count max from CPUID
The CPU advertises the maximum number of pages that can be shot down
with one INVLPGB instruction in the CPUID data.
Save that information for later use.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
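The same field can be read from userspace to check a machine (assumes
x86-64 with <cpuid.h>; on CPUs without INVLPGB the field reads as
zero, so the computed count is 1):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
            return 1;

        /* INVLPGB support is EBX bit 3; the count is EDX[15:0] + 1. */
        printf("INVLPGB supported: %s\n", (ebx & (1u << 3)) ? "yes" : "no");
        printf("invlpgb_count_max: %u\n", (edx & 0xffff) + 1);
        return 0;
    }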
---
arch/x86/Kconfig.cpu | 5 +++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/tlbflush.h | 7 +++++++
arch/x86/kernel/cpu/amd.c | 8 ++++++++
4 files changed, 21 insertions(+)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -726,6 +726,10 @@ config X86_VMX_FEATURE_NAMES
def_bool y
depends on IA32_FEAT_CTL
+config X86_BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
menuconfig PROCESSOR_SELECT
bool "Supported processor vendors" if EXPERT
help
@@ -762,6 +766,7 @@ config CPU_SUP_CYRIX_32
config CPU_SUP_AMD
default y
bool "Support AMD processors" if PROCESSOR_SELECT
+ select X86_BROADCAST_TLB_FLUSH
help
This enables detection, tunings and quirks for AMD processors
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -335,6 +335,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instruction supported. */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,13 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can we invalidate with one INVLPGB. */
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+extern u16 invlpgb_count_max;
+#else
+#define invlpgb_count_max 1
+#endif
+
extern void initialize_tlbstate_and_flush(void);
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -1135,6 +1137,12 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (boot_cpu_has(X86_FEATURE_INVLPGB)) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ invlpgb_count_max = (edx & 0xffff) + 1;
+ }
}
static const struct cpu_dev amd_cpu_dev = {


@@ -1,130 +0,0 @@
From 7a896b12875e2b988acbf0229fb4bcf9157b83bd Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:24 -0500
Subject: x86/mm: add INVLPGB support code
Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
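A hypothetical in-kernel caller, showing how the nosync helpers pair
with tlbsync(); the function names are the ones introduced below:

    #include <asm/invlpgb.h>

    /* Flush three 4K pages for one PCID, then wait once. */
    static void example_flush_three_pages(unsigned long pcid,
                                          unsigned long addr)
    {
        /* nr - 1 = 2 is what __invlpgb() encodes into ECX. */
        invlpgb_flush_user_nr_nosync(pcid, addr, 3, false);

        /* INVLPGB is weakly ordered; TLBSYNC waits for completion. */
        tlbsync();
    }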
---
arch/x86/include/asm/invlpgb.h | 101 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/tlbflush.h | 1 +
2 files changed, 102 insertions(+)
create mode 100644 arch/x86/include/asm/invlpgb.h
--- /dev/null
+++ b/arch/x86/include/asm/invlpgb.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVLPGB
+#define _ASM_X86_INVLPGB
+
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
+
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 extra_count,
+ bool pmd_stride, u8 flags)
+{
+ u32 edx = (pcid << 16) | asid;
+ u32 ecx = (pmd_stride << 31) | extra_count;
+ u64 rax = addr | flags;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx));
+}
+
+/* Wait for INVLPGB originated by this CPU to complete. */
+static inline void tlbsync(void)
+{
+ cant_migrate();
+ /* TLBSYNC: supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - INVLPGB_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first can be used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_VA BIT(0)
+#define INVLPGB_PCID BIT(1)
+#define INVLPGB_ASID BIT(2)
+#define INVLPGB_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FINAL_ONLY BIT(4)
+#define INVLPGB_INCLUDE_NESTED BIT(5)
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invlpgb_flush_user(unsigned long pcid,
+ unsigned long addr)
+{
+ __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
+ tlbsync();
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
+{
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
+ tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ __invlpgb(0, 0, 0, 0, 0, 0);
+ tlbsync();
+}
+
+#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invlpgb.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>


@@ -1,59 +0,0 @@
From 99f2b0eda74d7ec76c9c48b78f9d30d251501c28 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:25 -0500
Subject: x86/mm: use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
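The batching in broadcast_kernel_range_flush() can be exercised on its
own; this sketch simulates the loop, with an assumed invlpgb_count_max
of 8 standing in for the CPUID-reported limit:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    static unsigned long invlpgb_count_max = 8;   /* assumed limit */

    static void flush_range(unsigned long start, unsigned long end)
    {
        unsigned long addr, nr;

        for (addr = start; addr < end; addr += nr << PAGE_SHIFT) {
            nr = (end - addr) >> PAGE_SHIFT;
            if (nr > invlpgb_count_max)
                nr = invlpgb_count_max;
            printf("INVLPGB addr=%#lx nr=%lu (async)\n", addr, nr);
        }
        printf("TLBSYNC\n");    /* one wait for all batches */
    }

    int main(void)
    {
        /* 20 pages -> batches of 8, 8 and 4, then a single sync. */
        flush_range(0xffffc90000000000UL,
                    0xffffc90000000000UL + (20UL << PAGE_SHIFT));
        return 0;
    }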
---
arch/x86/mm/tlb.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1086,6 +1086,30 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
+static bool broadcast_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr;
+ unsigned long nr;
+
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_all();
+ return true;
+ }
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = min((info->end - addr) >> PAGE_SHIFT, invlpgb_count_max);
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ tlbsync();
+ return true;
+}
+
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
@@ -1105,7 +1129,9 @@ void flush_tlb_kernel_range(unsigned lon
info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
TLB_GENERATION_INVALID);
- if (info->end == TLB_FLUSH_ALL)
+ if (broadcast_kernel_range_flush(info))
+ ; /* Fall through. */
+ else if (info->end == TLB_FLUSH_ALL)
on_each_cpu(do_flush_tlb_all, NULL, 1);
else
on_each_cpu(do_kernel_range_flush, info, 1);


@@ -1,45 +0,0 @@
From 1ef7edb5b2375d4010ed2ad0c7d87fcfa7ab4519 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:26 -0500
Subject: x86/mm: use INVLPGB in flush_tlb_all
The flush_tlb_all() function is not used a whole lot, but we might
as well use broadcast TLB flushing there, too.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/mm/tlb.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1074,6 +1074,19 @@ void flush_tlb_mm_range(struct mm_struct
}
+static bool broadcast_flush_tlb_all(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH))
+ return false;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ guard(preempt)();
+ invlpgb_flush_all();
+ return true;
+}
+
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1082,6 +1095,8 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
+ if (broadcast_flush_tlb_all())
+ return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}


@@ -1,603 +0,0 @@
From c7212dc64d8e9e4f12f1c6edea3b75c350a30381 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:28 -0500
Subject: x86/mm: enable broadcast TLB invalidation for multi-threaded
processes
Use broadcast TLB invalidation, using the INVLPGB instruction, on AMD EPYC 3
and newer CPUs.
In order to not exhaust PCID space, and keep TLB flushes local for single
threaded processes, we only hand out broadcast ASIDs to processes active on
4 or more CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
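The "4 or more CPUs" policy reduces to the check sketched below, with
arrays standing in for the per-CPU cpu_tlbstate fields the kernel
consults:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_CPUS 8

    static bool exceeds_threshold(const int loaded_mm[],
                                  const bool is_lazy[],
                                  int mm, int threshold)
    {
        int count = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            /* Skip CPUs not really running this process. */
            if (loaded_mm[cpu] != mm || is_lazy[cpu])
                continue;
            if (++count > threshold)
                return true;
        }
        return false;
    }

    int main(void)
    {
        int loaded_mm[NR_CPUS] = { 1, 1, 1, 1, 2, 2, 1, 0 };
        bool is_lazy[NR_CPUS]  = { false, false, false, false,
                                   false, false, true,  false };

        /* Four non-lazy CPUs run mm 1: it gets a global ASID. */
        printf("%s\n", exceeds_threshold(loaded_mm, is_lazy, 1, 3)
               ? "exceeds" : "local");
        return 0;
    }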
---
arch/x86/include/asm/mmu.h | 6 +
arch/x86/include/asm/mmu_context.h | 14 ++
arch/x86/include/asm/tlbflush.h | 73 ++++++
arch/x86/mm/tlb.c | 344 ++++++++++++++++++++++++++++-
4 files changed, 425 insertions(+), 12 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -69,6 +69,12 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ u16 global_asid;
+ bool asid_transition;
+#endif
+
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void destroy_context_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
@@ -161,6 +163,14 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+#endif
+
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -170,6 +180,10 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ destroy_context_free_global_asid(mm);
+#endif
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -239,6 +240,78 @@ void flush_tlb_one_kernel(unsigned long
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+static inline bool is_dyn_asid(u16 asid)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return true;
+
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
+static inline bool in_asid_transition(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+#else
+static inline bool is_dyn_asid(u16 asid)
+{
+ return true;
+}
+
+static inline bool is_global_asid(u16 asid)
+{
+ return false;
+}
+
+static inline bool in_asid_transition(struct mm_struct *mm)
+{
+ return false;
+}
+
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ return false;
+}
+
+static inline void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ VM_WARN_ON_ONCE(1);
+}
+
+static inline void consider_global_asid(struct mm_struct *mm)
+{
+}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with broadcast TLB
+ * flushing. The TLB is never outdated, and does not need flushing.
+ */
+ if (IS_ENABLED(CONFIG_X86_BROADCAST_TLB_FLUSH) && static_cpu_has(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -251,6 +267,272 @@ static void choose_new_asid(struct mm_st
*need_flush = true;
}
+#ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
+/*
+ * Logic for broadcast TLB invalidation.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE) = { 0 };
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE) = { 0 };
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ /*
+ * A global TLB flush guarantees that any stale entries from
+ * previously freed global ASIDs get flushed from the TLB
+ * everywhere, making these global ASIDs safe to reuse.
+ */
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * Clear all the previously freed global ASIDs from the
+ * broadcast_asid_used bitmap, now that the global TLB flush
+ * has made them actually available for re-use.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /*
+ * ASIDs 0-TLB_NR_DYN_ASIDS are used for CPU-local ASID
+ * assignments, for tasks doing IPI based TLB shootdowns.
+ * Restart the search from the start of the global ASID space.
+ */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 get_global_asid(void)
+{
+
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previous allocated ASID is at the top of the address space. */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Returns true if the mm is transitioning from a CPU-local ASID to a global
+ * (INVLPGB) ASID, or the other way around.
+ */
+static bool needs_global_asid_reload(struct mm_struct *next, u16 prev_asid)
+{
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid && prev_asid != global_asid)
+ return true;
+
+ if (!global_asid && is_global_asid(prev_asid))
+ return true;
+
+ return false;
+}
+
+void destroy_context_free_global_asid(struct mm_struct *mm)
+{
+ if (!mm->context.global_asid)
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+}
+
+/*
+ * Check whether a process is currently active on more than "threshold" CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm->context.global_asid)
+ return;
+
+ /* The last global ASID was consumed while waiting for the lock. */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = get_global_asid();
+ if (!asid)
+ return;
+
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+
+static bool meets_global_asid_threshold(struct mm_struct *mm)
+{
+ if (!global_asid_available)
+ return false;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ return mm_active_cpus_exceeds(mm, 3);
+}
+
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!static_cpu_has(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ if (meets_global_asid_threshold(mm))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!READ_ONCE(mm->context.asid_transition))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long maxnr = invlpgb_count_max;
+ unsigned long asid = info->mm->context.global_asid;
+ unsigned long addr = info->start;
+ unsigned long nr;
+
+ /* Flushing multiple pages at once is not supported with 1GB pages. */
+ if (info->stride_shift > PMD_SHIFT)
+ maxnr = 1;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ /*
+ * Calculate how many pages can be flushed at once; if the
+ * remainder of the range is less than one page, flush one.
+ */
+ nr = min(maxnr, (info->end - addr) >> info->stride_shift);
+ nr = max(nr, 1);
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ tlbsync();
+}
+#endif /* CONFIG_X86_BROADCAST_TLB_FLUSH */
+
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
@@ -556,8 +838,9 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
- next->context.ctx_id);
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
/*
* If this races with another thread that enables lam, 'new_lam'
@@ -574,6 +857,23 @@ void switch_mm_irqs_off(struct mm_struct
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
+ * Check if the current mm is transitioning to a new ASID.
+ */
+ if (needs_global_asid_reload(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this PCID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
+ /*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
@@ -607,6 +907,13 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that we're changing CR3.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
@@ -623,14 +930,12 @@ void switch_mm_irqs_off(struct mm_struct
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -749,7 +1054,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -769,6 +1074,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (needs_global_asid_reload(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -786,6 +1101,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -953,7 +1270,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
@@ -1058,9 +1375,12 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();


@@ -1,251 +0,0 @@
From 6f601cdcd33be8fc0da98c6bab777575af3260b8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:29 -0500
Subject: x86/mm: do targeted broadcast flushing from tlbbatch code
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
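The new need_tlbsync flag is a deferred-fence pattern; reduced to its
essentials (a standalone sketch, not the kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    static bool need_tlbsync;   /* stands in for the per-CPU flag */

    static void queue_flush(unsigned long addr)
    {
        printf("INVLPGB %#lx (async)\n", addr);
        need_tlbsync = true;
    }

    static void tlbsync(void)
    {
        if (!need_tlbsync)
            return;             /* nothing outstanding: free no-op */
        printf("TLBSYNC\n");
        need_tlbsync = false;
    }

    int main(void)
    {
        queue_flush(0x1000);
        queue_flush(0x2000);
        tlbsync();              /* one wait covers both flushes */
        tlbsync();              /* no-op */
        return 0;
    }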
---
arch/x86/include/asm/invlpgb.h | 21 +++++----
arch/x86/include/asm/tlbflush.h | 17 ++++---
arch/x86/mm/tlb.c | 80 +++++++++++++++++++++++++++++++--
3 files changed, 95 insertions(+), 23 deletions(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -31,9 +31,8 @@ static inline void __invlpgb(unsigned lo
}
/* Wait for INVLPGB originated by this CPU to complete. */
-static inline void tlbsync(void)
+static inline void __tlbsync(void)
{
- cant_migrate();
/* TLBSYNC: supported in binutils >= 2.36. */
asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
}
@@ -61,19 +60,19 @@ static inline void invlpgb_flush_user(un
unsigned long addr)
{
__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
- tlbsync();
+ __tlbsync();
}
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
- unsigned long addr,
- u16 nr,
- bool pmd_stride)
+static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr,
+ bool pmd_stride)
{
__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
}
/* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid_nosync(unsigned long pcid)
{
__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
}
@@ -82,11 +81,11 @@ static inline void invlpgb_flush_single_
static inline void invlpgb_flush_all(void)
{
__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
- tlbsync();
+ __tlbsync();
}
/* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
{
__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
}
@@ -95,7 +94,7 @@ static inline void invlpgb_flush_addr_no
static inline void invlpgb_flush_all_nonglobals(void)
{
__invlpgb(0, 0, 0, 0, 0, 0);
- tlbsync();
+ __tlbsync();
}
#endif /* _ASM_X86_INVLPGB */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -106,6 +106,7 @@ struct tlb_state {
* need to be invalidated.
*/
bool invalidate_other;
+ bool need_tlbsync;
#ifdef CONFIG_ADDRESS_MASKING
/*
@@ -310,6 +311,10 @@ static inline void broadcast_tlb_flush(s
static inline void consider_global_asid(struct mm_struct *mm)
{
}
+
+static inline void tlbsync(void)
+{
+}
#endif
#ifdef CONFIG_PARAVIRT
@@ -359,21 +364,15 @@ static inline u64 inc_mm_tlb_gen(struct
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
-{
- inc_mm_tlb_gen(mm);
- cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr);
static inline bool pte_flags_need_flush(unsigned long oldflags,
unsigned long newflags,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -488,6 +488,37 @@ static void finish_asid_transition(struc
WRITE_ONCE(mm->context.asid_transition, false);
}
+static inline void tlbsync(void)
+{
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ return;
+ __tlbsync();
+ this_cpu_write(cpu_tlbstate.need_tlbsync, false);
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool pmd_stride)
+{
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_flush_single_pcid_nosync(pcid);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb_flush_addr_nosync(addr, nr);
+ if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+ this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
static void broadcast_tlb_flush(struct flush_tlb_info *info)
{
bool pmd = info->stride_shift == PMD_SHIFT;
@@ -794,6 +825,8 @@ void switch_mm_irqs_off(struct mm_struct
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
+ tlbsync();
+
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
@@ -976,6 +1009,8 @@ reload_tlb:
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
+ tlbsync();
+
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
@@ -1650,9 +1685,7 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
- invlpgb_flush_all_nonglobals();
- } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
@@ -1661,12 +1694,53 @@ void arch_tlbbatch_flush(struct arch_tlb
local_irq_enable();
}
+ /*
+ * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+ * The cpumask above contains only CPUs that were running tasks
+ * not using broadcast TLB flushing.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ tlbsync();
+
cpumask_clear(&batch->cpumask);
put_flush_tlb_info();
put_cpu();
}
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
+{
+ u16 asid = mm_global_asid(mm);
+
+ if (asid) {
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+ /*
+ * Some CPUs might still be using a local ASID for this
+ * process, and require IPIs, while others are using the
+ * global ASID.
+ *
+ * In this corner case we need to do both the broadcast
+ * TLB invalidation, and send IPIs. The IPIs will help
+ * stragglers transition to the broadcast ASID.
+ */
+ if (in_asid_transition(mm))
+ asid = 0;
+ }
+
+ if (!asid) {
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ }
+
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
/*
* Blindly accessing user memory from NMI context can be dangerous
* if we're in the middle of switching the current user task or

View File

@@ -1,80 +0,0 @@
From 7b8ef03b059bca98d2af696c3ec2adcaa673f7e4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:31 -0500
Subject: x86/mm: only invalidate final translations with INVLPGB
Use the INVLPGB_FINAL_ONLY flag when invalidating mappings with INVLPGB.
This way only leaf mappings get removed from the TLB, leaving intermediate
translations cached.
On the (rare) occasions where we free page tables we do a full flush,
ensuring intermediate translations get flushed from the TLB.
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
---
arch/x86/include/asm/invlpgb.h | 10 ++++++++--
arch/x86/mm/tlb.c | 13 +++++++------
2 files changed, 15 insertions(+), 8 deletions(-)
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -66,9 +66,15 @@ static inline void invlpgb_flush_user(un
static inline void __invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr,
- bool pmd_stride)
+ bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
+ u8 flags = INVLPGB_PCID | INVLPGB_VA;
+
+ if (!freed_tables)
+ flags |= INVLPGB_FINAL_ONLY;
+
+ __invlpgb(0, pcid, addr, nr - 1, pmd_stride, flags);
}
/* Flush all mappings for a given PCID, not including globals. */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -498,9 +498,10 @@ static inline void tlbsync(void)
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
- u16 nr, bool pmd_stride)
+ u16 nr, bool pmd_stride,
+ bool freed_tables)
{
- __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride);
+ __invlpgb_flush_user_nr_nosync(pcid, addr, nr, pmd_stride, freed_tables);
if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
this_cpu_write(cpu_tlbstate.need_tlbsync, true);
}
@@ -549,10 +550,10 @@ static void broadcast_tlb_flush(struct f
nr = min(maxnr, (info->end - addr) >> info->stride_shift);
nr = max(nr, 1);
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd, info->freed_tables);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd, info->freed_tables);
addr += nr << info->stride_shift;
} while (addr < info->end);
@@ -1715,10 +1716,10 @@ void arch_tlbbatch_add_pending(struct ar
u16 asid = mm_global_asid(mm);
if (asid) {
- invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false, false);
/* Do any CPUs supporting INVLPGB need PTI? */
if (static_cpu_has(X86_FEATURE_PTI))
- invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false, false);
/*
* Some CPUs might still be using a local ASID for this


@@ -1,94 +0,0 @@
From 7b0836fcad644d24d6318bf63013ec1b35d6a27b Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Thu, 19 Dec 2024 15:32:53 -0500
Subject: mm: remove unnecessary calls to lru_add_drain
There seem to be several categories of calls to lru_add_drain
and lru_add_drain_all.
The first are code paths that recently allocated, swapped in,
or otherwise processed a batch of pages, and want them all on
the LRU. These drain pages that were recently allocated,
probably on the local CPU.
A second category are code paths that are actively trying to
reclaim, migrate, or offline memory. These often use lru_add_drain_all,
to drain the caches on all CPUs.
However, there also seem to be some other callers where we
aren't really doing either. They are calling lru_add_drain(),
despite operating on pages that may have been allocated
long ago, and quite possibly on different CPUs.
Those calls are not likely to be effective at anything but
creating lock contention on the LRU locks.
Remove the lru_add_drain calls in the latter category.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
---
mm/memory.c | 1 -
mm/mmap.c | 2 --
mm/swap_state.c | 1 -
mm/vma.c | 2 --
4 files changed, 6 deletions(-)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1921,7 +1921,6 @@ void zap_page_range_single(struct vm_are
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1931,7 +1931,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2374,7 +2373,6 @@ int relocate_vma_down(struct vm_area_str
vma, new_start, length, false, true))
return -ENOMEM;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct en
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -347,7 +347,6 @@ void unmap_region(struct ma_state *mas,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -1089,7 +1088,6 @@ static inline void vms_clear_ptes(struct
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,


@@ -1,429 +0,0 @@
From 7ecab5a83d3155baa009cd6bc6e18959fee8be62 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Mon, 14 Oct 2024 16:13:39 +0100
Subject: vdso: Introduce vdso/page.h
The VDSO implementation includes headers from outside of the
vdso/ namespace.
Introduce vdso/page.h to make sure that the generic library
uses only the allowed namespace.
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Link: https://lore.kernel.org/all/20241014151340.1639555-3-vincenzo.frascino@arm.com
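The new header itself is not shown in this view; reconstructed from
the per-arch definitions removed below, it presumably looks roughly
like this (details may differ):

    /* SPDX-License-Identifier: GPL-2.0 */
    #ifndef __VDSO_PAGE_H
    #define __VDSO_PAGE_H

    #include <vdso/const.h>

    /* PAGE_SHIFT determines the page size */
    #define PAGE_SHIFT      CONFIG_PAGE_SHIFT

    #define PAGE_SIZE       (_AC(1, UL) << CONFIG_PAGE_SHIFT)

    #define PAGE_MASK       (~(PAGE_SIZE - 1))

    #endif /* __VDSO_PAGE_H */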
---
arch/alpha/include/asm/page.h | 6 +-----
arch/arc/include/uapi/asm/page.h | 7 +++----
arch/arm/include/asm/page.h | 5 +----
arch/arm64/include/asm/page-def.h | 5 +----
arch/csky/include/asm/page.h | 8 ++------
arch/hexagon/include/asm/page.h | 4 +---
arch/loongarch/include/asm/page.h | 7 +------
arch/m68k/include/asm/page.h | 6 ++----
arch/microblaze/include/asm/page.h | 5 +----
arch/mips/include/asm/page.h | 7 +------
arch/nios2/include/asm/page.h | 7 +------
arch/openrisc/include/asm/page.h | 11 +----------
arch/parisc/include/asm/page.h | 4 +---
arch/powerpc/include/asm/page.h | 10 +---------
arch/riscv/include/asm/page.h | 4 +---
arch/s390/include/asm/page.h | 13 +++++--------
arch/sh/include/asm/page.h | 6 ++----
arch/sparc/include/asm/page_32.h | 4 +---
arch/sparc/include/asm/page_64.h | 4 +---
arch/um/include/asm/page.h | 5 +----
arch/x86/include/asm/page_types.h | 5 +----
arch/xtensa/include/asm/page.h | 8 +-------
include/vdso/page.h | 30 ++++++++++++++++++++++++++++++
23 files changed, 61 insertions(+), 110 deletions(-)
create mode 100644 include/vdso/page.h
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -4,11 +4,7 @@
#include <linux/const.h>
#include <asm/pal.h>
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arc/include/uapi/asm/page.h
+++ b/arch/arc/include/uapi/asm/page.h
@@ -14,7 +14,7 @@
/* PAGE_SHIFT determines the page size */
#ifdef __KERNEL__
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+#include <vdso/page.h>
#else
/*
* Default 8k
@@ -24,11 +24,10 @@
* not available
*/
#define PAGE_SHIFT 13
+#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
+#define PAGE_MASK (~(PAGE_SIZE-1))
#endif
-#define PAGE_SIZE _BITUL(PAGE_SHIFT) /* Default 8K */
#define PAGE_OFFSET _AC(0x80000000, UL) /* Kernel starts at 2G onwards */
-#define PAGE_MASK (~(PAGE_SIZE-1))
-
#endif /* _UAPI__ASM_ARC_PAGE_H */
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -7,10 +7,7 @@
#ifndef _ASMARM_PAGE_H
#define _ASMARM_PAGE_H
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/arm64/include/asm/page-def.h
+++ b/arch/arm64/include/asm/page-def.h
@@ -10,9 +10,6 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#endif /* __ASM_PAGE_DEF_H */
--- a/arch/csky/include/asm/page.h
+++ b/arch/csky/include/asm/page.h
@@ -7,12 +7,8 @@
#include <asm/cache.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size: 4KB
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
+
#define THREAD_SIZE (PAGE_SIZE * 2)
#define THREAD_MASK (~(THREAD_SIZE - 1))
#define THREAD_SHIFT (PAGE_SHIFT + 1)
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -45,9 +45,7 @@
#define HVM_HUGEPAGE_SIZE 0x5
#endif
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -8,12 +8,7 @@
#include <linux/const.h>
#include <asm/addrspace.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3)
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/m68k/include/asm/page.h
+++ b/arch/m68k/include/asm/page.h
@@ -6,10 +6,8 @@
#include <asm/setup.h>
#include <asm/page_offset.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PAGE_OFFSET (PAGE_OFFSET_RAW)
#ifndef __ASSEMBLY__
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -19,10 +19,7 @@
#ifdef __KERNEL__
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define LOAD_OFFSET ASM_CONST((CONFIG_KERNEL_START-CONFIG_KERNEL_BASE_ADDR))
--- a/arch/mips/include/asm/page.h
+++ b/arch/mips/include/asm/page.h
@@ -14,12 +14,7 @@
#include <linux/kernel.h>
#include <asm/mipsregs.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
+#include <vdso/page.h>
/*
* This is used for calculating the real page sizes
--- a/arch/nios2/include/asm/page.h
+++ b/arch/nios2/include/asm/page.h
@@ -18,12 +18,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
/*
* PAGE_OFFSET -- the first address of the first page of memory.
--- a/arch/openrisc/include/asm/page.h
+++ b/arch/openrisc/include/asm/page.h
@@ -15,16 +15,7 @@
#ifndef __ASM_OPENRISC_PAGE_H
#define __ASM_OPENRISC_PAGE_H
-
-/* PAGE_SHIFT determines the page size */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define PAGE_OFFSET 0xc0000000
#define KERNELBASE PAGE_OFFSET
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -21,8 +21,7 @@
* page size. When using 64K pages however, whether we are really supporting
* 64K pages in HW or not is irrelevant to those definitions.
*/
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT)
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
#ifndef CONFIG_HUGETLB_PAGE
@@ -42,13 +41,6 @@ extern unsigned int hpage_shift;
#endif
/*
- * Subtle: (1 << PAGE_SHIFT) is an int, not an unsigned long. So if we
- * assign PAGE_MASK to a larger type it gets extended the way we want
- * (i.e. with 1s in the high bits)
- */
-#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
-
-/*
* KERNELBASE is the virtual address of the start of the kernel, it's often
* the same as PAGE_OFFSET, but _might not be_.
*
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -12,9 +12,7 @@
#include <linux/pfn.h>
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
+#include <vdso/page.h>
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -11,14 +11,11 @@
#include <linux/const.h>
#include <asm/types.h>
-#define _PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT)
-#define _PAGE_MASK (~(_PAGE_SIZE - 1))
+#include <vdso/page.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT _PAGE_SHIFT
-#define PAGE_SIZE _PAGE_SIZE
-#define PAGE_MASK _PAGE_MASK
+#define _PAGE_SHIFT PAGE_SHIFT
+#define _PAGE_SIZE PAGE_SIZE
+#define _PAGE_MASK PAGE_MASK
#define PAGE_DEFAULT_ACC _AC(0, UL)
/* storage-protection override */
#define PAGE_SPO_ACC 9
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -8,10 +8,8 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
+
#define PTE_MASK PAGE_MASK
#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -11,9 +11,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -4,9 +4,7 @@
#include <linux/const.h>
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
/* Flushing for D-cache alias handling is only needed if
* the page size is smaller than 16K.
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -9,10 +9,7 @@
#include <linux/const.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifndef __ASSEMBLY__
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -6,10 +6,7 @@
#include <linux/types.h>
#include <linux/mem_encrypt.h>
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
--- a/arch/xtensa/include/asm/page.h
+++ b/arch/xtensa/include/asm/page.h
@@ -18,13 +18,7 @@
#include <asm/cache.h>
#include <asm/kmem_layout.h>
-/*
- * PAGE_SHIFT determines the page size
- */
-
-#define PAGE_SHIFT CONFIG_PAGE_SHIFT
-#define PAGE_SIZE (__XTENSA_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE-1))
+#include <vdso/page.h>
#ifdef CONFIG_MMU
#define PAGE_OFFSET XCHAL_KSEG_CACHED_VADDR
--- /dev/null
+++ b/include/vdso/page.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_PAGE_H
+#define __VDSO_PAGE_H
+
+#include <uapi/linux/const.h>
+
+/*
+ * PAGE_SHIFT determines the page size.
+ *
+ * Note: This definition is required because PAGE_SHIFT is used
+ * in several places throughout the codebase.
+ */
+#define PAGE_SHIFT CONFIG_PAGE_SHIFT
+
+#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
+
+#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+/*
+ * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ *
+ * Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
+ * So if we assign PAGE_MASK to a larger type it gets extended the
+ * way we want (i.e. with 1s in the high bits)
+ */
+#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
+#else
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#endif /* __VDSO_PAGE_H */
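The sign-extension subtlety described in the PAGE_MASK comment above can be
demonstrated in user-space C; the fixed-width types stand in for a 32-bit
kernel's int and unsigned long, and a 4 KiB page size is assumed:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t phys  = 0x1234567000ULL;	/* 64-bit phys_addr_t on a 32-bit kernel */
	int32_t  smask = ~((1 << 12) - 1);	/* (1 << PAGE_SHIFT) is an int */
	uint32_t umask = ~((1U << 12) - 1);	/* a 32-bit unsigned long mask */

	/* The signed mask widens with 1s in the high bits: address preserved. */
	printf("%#llx\n", (unsigned long long)(phys & smask));	/* 0x1234567000 */

	/* The unsigned 32-bit mask zero-extends and truncates the address. */
	printf("%#llx\n", (unsigned long long)(phys & umask));	/* 0x34567000 */
	return 0;
}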


@@ -1,68 +0,0 @@
From d1bcf51400e790e65945a29078bd816bd61aa148 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 24 Oct 2024 13:34:26 +0000
Subject: vdso: Change PAGE_MASK to signed on all 32-bit architectures
With the introduction of an architecture-independent definition of
PAGE_MASK, we had to make a choice between defining it as 'unsigned long'
as on 64-bit architectures, or as signed 'long' as required for
architectures with a 64-bit phys_addr_t.
To reduce the risk for regressions and minimize the changes in behavior,
the result was using the signed value only when CONFIG_PHYS_ADDR_T_64BIT
is set, but that ended up causing a regression after all in the
early_init_dt_add_memory_arch() function that uses 64-bit integers for
address calculation.
Presumably the same regression also affects mips32 and powerpc32 when
dealing with large amounts of memory on DT platforms: like arm32, they were
using the signed version unconditionally.
The two most sensible options for addressing the regression are either to
go back to an architecture specific definition, using a signed constant on
arm/powerpc/mips and unsigned on the others, or to use the same definition
everywhere.
Use the simpler of those two and change them all to the signed version, in
the hope that this does not cause a different type of bug. Most of the
other 32-bit architectures have no large physical address support and are
rarely used, so it seems more likely that using the same definition helps
than hurts here.
In particular, x86-32 does have physical addressing extensions, so it
already changed to the signed version after the previous patch, so it makes
sense to use the same version on non-PAE as well.
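A minimal sketch of the arithmetic behind the regression, assuming a 4 KiB
page size and a memory bank starting above 4 GiB as on an arm32 LPAE
system (the exact values seen by early_init_dt_add_memory_arch() will
differ):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t base  = 0x100000000ULL;	/* RAM starting at 4 GiB */
	int32_t  smask = ~((1 << 12) - 1);	/* signed PAGE_MASK */
	uint32_t umask = ~((1U << 12) - 1);	/* unsigned 32-bit PAGE_MASK */

	printf("%#llx\n", (unsigned long long)(base & smask));	/* 0x100000000 */
	printf("%#llx\n", (unsigned long long)(base & umask));	/* 0x0: bank lost */
	return 0;
}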
Fixes: efe8419ae78d ("vdso: Introduce vdso/page.h")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Link: https://lore.kernel.org/all/20241024133447.3117273-1-arnd@kernel.org
Link: https://lore.kernel.org/lkml/CA+G9fYt86bUAu_v5dXPWnDUwQNVipj+Wq3Djir1KUSKdr9QLNg@mail.gmail.com/
---
include/vdso/page.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/include/vdso/page.h
+++ b/include/vdso/page.h
@@ -14,13 +14,14 @@
#define PAGE_SIZE (_AC(1,UL) << CONFIG_PAGE_SHIFT)
-#if defined(CONFIG_PHYS_ADDR_T_64BIT) && !defined(CONFIG_64BIT)
+#if !defined(CONFIG_64BIT)
/*
- * Applies only to 32-bit architectures with a 64-bit phys_addr_t.
+ * Applies only to 32-bit architectures.
*
* Subtle: (1 << CONFIG_PAGE_SHIFT) is an int, not an unsigned long.
* So if we assign PAGE_MASK to a larger type it gets extended the
- * way we want (i.e. with 1s in the high bits)
+ * way we want (i.e. with 1s in the high bits) while masking a
+ * 64-bit value such as phys_addr_t.
*/
#define PAGE_MASK (~((1 << CONFIG_PAGE_SHIFT) - 1))
#else


@@ -1,7 +1,7 @@
From e11153c4df0fee7caadec3714a60a4936d6a9ea2 Mon Sep 17 00:00:00 2001
From 1901291057a3f1bf2bf94c7a4ddf3253d3116acb Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:20 -0500
Subject: x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional
Date: Thu, 13 Feb 2025 11:13:52 -0500
Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional
Currently x86 uses CONFIG_MMU_GATHER_RCU_TABLE_FREE when using
paravirt, and not when running on bare metal.
@@ -11,8 +11,9 @@ each setup. Make them all the same.
Currently get_user_pages_fast synchronizes against page table
freeing in two different ways:
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
- on bare metal, by blocking IRQs, which block TLB flush IPIs
- on paravirt, with MMU_GATHER_RCU_TABLE_FREE
This is done because some paravirt TLB flush implementations
handle the TLB flush in the hypervisor, and will do the flush
@@ -27,18 +28,22 @@ as an implicit way to block RCU frees.
That makes it safe to use INVLPGB on AMD CPUs.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com
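As a rough illustration of why RCU-deferred freeing keeps lockless page
table walkers safe without IPIs, here is a simplified sketch assuming the
usual call_rcu() semantics; the real code batches tables in the mmu_gather
and falls back to an IPI when the allocation fails:

#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>

struct deferred_table {
	struct rcu_head rcu;
	unsigned long table;
};

static void deferred_table_free(struct rcu_head *head)
{
	struct deferred_table *dt = container_of(head, struct deferred_table, rcu);

	/* Every lockless walker that could still see the table has finished. */
	free_page(dt->table);
	kfree(dt);
}

static void remove_table_rcu(unsigned long table)
{
	struct deferred_table *dt = kmalloc(sizeof(*dt), GFP_ATOMIC);

	if (!dt)
		return;		/* the real code falls back to an IPI here */

	dt->table = table;
	call_rcu(&dt->rcu, deferred_table_free);
}

GUP-fast runs with interrupts disabled, which also blocks an RCU grace
period on that CPU, so the deferred free cannot run under a concurrent
lockless walk.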
---
arch/x86/Kconfig | 2 +-
arch/x86/kernel/paravirt.c | 7 +------
arch/x86/mm/pgtable.c | 16 ++++------------
3 files changed, 6 insertions(+), 19 deletions(-)
arch/x86/kernel/paravirt.c | 17 +----------------
arch/x86/mm/pgtable.c | 27 ++++-----------------------
3 files changed, 6 insertions(+), 40 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -270,7 +270,7 @@ config X86
@@ -277,7 +277,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
@@ -49,19 +54,29 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
select HAVE_REGS_AND_STACK_ACCESS_API
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void)
@@ -59,21 +59,6 @@ void __init native_pv_lock_init(void)
static_branch_enable(&virt_spin_lock_key);
}
-#ifndef CONFIG_PT_RECLAIM
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#else
-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -191,7 +186,7 @@ struct paravirt_patch_template pv_ops =
@@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,
@@ -72,53 +87,63 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
.mmu.notify_page_enc_status_changed = paravirt_nop,
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask);
@@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
-#ifndef CONFIG_PARAVIRT
-#ifndef CONFIG_PT_RECLAIM
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_page(tlb, table);
- struct ptdesc *ptdesc = (struct ptdesc *)table;
-
- pagetable_dtor(ptdesc);
- tlb_remove_page(tlb, ptdesc_page(ptdesc));
-}
-#endif
-#else
-static inline
-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- tlb_remove_table(tlb, table);
-}
-#endif /* !CONFIG_PT_RECLAIM */
-#endif /* !CONFIG_PARAVIRT */
-
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *
@@ -64,7 +45,7 @@ early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
- paravirt_tlb_remove_table(tlb, pte);
+ tlb_remove_table(tlb, pte);
- paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
+ tlb_remove_table(tlb, page_ptdesc(pte));
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct mmu_gather *
@@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
pagetable_pmd_dtor(ptdesc);
- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
+ tlb_remove_table(tlb, ptdesc_page(ptdesc));
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
+ tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *
pagetable_pud_dtor(ptdesc);
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+ tlb_remove_table(tlb, virt_to_page(pud));
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
+ tlb_remove_table(tlb, virt_to_ptdesc(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+ tlb_remove_table(tlb, virt_to_page(p4d));
- paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
+ tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */


@@ -1,16 +1,20 @@
From e8008cb69c5e4efbaedd70b0fb692343e4aa0e51 Mon Sep 17 00:00:00 2001
From 002a3e971d0d7987bdcdd564eccfa3dd63637226 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:21 -0500
Subject: x86/mm: remove pv_ops.mmu.tlb_remove_table call
Date: Thu, 13 Feb 2025 11:13:53 -0500
Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
Get rid of the indirection by simply calling tlb_remove_table directly,
and not going through the paravirt function pointers.
Signed-off-by: Rik van Riel <riel@surriel.com>
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Tested-by: Brendan Jackman <jackmanb@google.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com
---
arch/x86/hyperv/mmu.c | 1 -
arch/x86/include/asm/paravirt.h | 5 -----
@@ -22,7 +26,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void)
@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
@@ -44,7 +48,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
PVOP_VCALL1(mmu.exit_mmap, mm);
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,8 +136,6 @@ struct pv_mmu_ops {
@@ -134,8 +134,6 @@ struct pv_mmu_ops {
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
@@ -65,7 +69,7 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -186,7 +186,6 @@ struct paravirt_patch_template pv_ops =
@@ -180,7 +180,6 @@ struct paravirt_patch_template pv_ops =
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_multi = native_flush_tlb_multi,


@@ -0,0 +1,87 @@
From d4784e28cc2e488fce80ded0ff086c50244593ca Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:36 -0500
Subject: x86/mm: Consolidate full flush threshold decision
Reduce code duplication by consolidating the decision point for whether to do
individual invalidations or a full flush inside get_flush_tlb_info().
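The consolidated check amounts to the sketch below; 33 is the default
tlb_single_page_flush_ceiling at the time of writing, and TLB_FLUSH_ALL is
-1UL on x86:

static bool wants_full_flush(unsigned long start, unsigned long end,
			     unsigned int stride_shift)
{
	/* tlb_single_page_flush_ceiling defaults to 33 */
	return ((end - start) >> stride_shift) > 33;
}

Because end == TLB_FLUSH_ALL makes (end - start) enormous, the old
explicit TLB_FLUSH_ALL tests fall out of this single comparison; a 64-page
flush (256 KiB at a 4 KiB stride), for example, exceeds the ceiling and is
promoted to a full flush.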
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
---
arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
1 file changed, 19 insertions(+), 22 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1000,6 +1000,15 @@ static struct flush_tlb_info *get_flush_
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
#endif
+ /*
+ * If the number of flushes is so large that a full flush
+ * would be faster, do a full flush.
+ */
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
+ start = 0;
+ end = TLB_FLUSH_ALL;
+ }
+
info->start = start;
info->end = end;
info->mm = mm;
@@ -1026,17 +1035,8 @@ void flush_tlb_mm_range(struct mm_struct
bool freed_tables)
{
struct flush_tlb_info *info;
+ int cpu = get_cpu();
u64 new_tlb_gen;
- int cpu;
-
- cpu = get_cpu();
-
- /* Should we flush just the requested range? */
- if ((end == TLB_FLUSH_ALL) ||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
- start = 0;
- end = TLB_FLUSH_ALL;
- }
/* This is also a barrier that synchronizes with switch_mm(). */
new_tlb_gen = inc_mm_tlb_gen(mm);
@@ -1089,22 +1089,19 @@ static void do_kernel_range_flush(void *
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info *info;
+ struct flush_tlb_info *info;
- preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false,
- TLB_GENERATION_INVALID);
+ guard(preempt)();
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
+ TLB_GENERATION_INVALID);
+
+ if (info->end == TLB_FLUSH_ALL)
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ else
on_each_cpu(do_kernel_range_flush, info, 1);
- put_flush_tlb_info();
- preempt_enable();
- }
+ put_flush_tlb_info();
}
/*


@@ -0,0 +1,103 @@
From e5d151337c384934c9b669967d72f9b29781b308 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:37 -0500
Subject: x86/mm: Add INVLPGB feature and Kconfig entry
Add the INVLPGB feature bit and a Kconfig entry to guard its use.
In addition, the CPU advertises the maximum number of pages that can be
shot down with one INVLPGB instruction in CPUID. Save that information
for later use.
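The same decode can be reproduced from user space on an x86 machine; a
sketch using the GCC/clang <cpuid.h> helper, assuming leaf 0x80000008
carries the INVLPGB feature bit in EBX[3] and the page count minus one in
EDX[15:0], matching the cpu_detect_tlb_amd() hunk below:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 1;

	if (ebx & (1u << 3))	/* X86_FEATURE_INVLPGB (13*32+ 3) */
		printf("INVLPGB flushes up to %u pages per instruction\n",
		       (edx & 0xffff) + 1);
	else
		printf("INVLPGB not supported\n");
	return 0;
}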
[ bp: use cpu_has(), typos, massage. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com
---
arch/x86/Kconfig.cpu | 4 ++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/disabled-features.h | 8 +++++++-
arch/x86/include/asm/tlbflush.h | 3 +++
arch/x86/kernel/cpu/amd.c | 6 ++++++
5 files changed, 21 insertions(+), 1 deletion(-)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT
This lets you choose what x86 vendor support code your kernel
will include.
+config BROADCAST_TLB_FLUSH
+ def_bool y
+ depends on CPU_SUP_AMD && 64BIT
+
config CPU_SUP_INTEL
default y
bool "Support Intel processors" if PROCESSOR_SELECT
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -338,6 +338,7 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -129,6 +129,12 @@
#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
#endif
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+#define DISABLE_INVLPGB 0
+#else
+#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -146,7 +152,7 @@
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM)
-#define DISABLED_MASK13 0
+#define DISABLED_MASK13 (DISABLE_INVLPGB)
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void)
extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;
+/* How many pages can be invalidated with one INVLPGB. */
+extern u16 invlpgb_count_max;
+
extern void initialize_tlbstate_and_flush(void);
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -29,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -1139,6 +1141,10 @@ static void cpu_detect_tlb_amd(struct cp
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {


@@ -0,0 +1,170 @@
From 9bbface3289771c5990e97ca047a52faaeafdb83 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Fri, 28 Feb 2025 20:32:30 +0100
Subject: x86/mm: Add INVLPGB support code
Add helper functions and definitions needed to use broadcast TLB
invalidation on AMD CPUs.
[ bp:
- Cleanup commit message
- Improve and expand comments
- push the preemption guards inside the invlpgb* helpers
- merge improvements from dhansen
- add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
can't do DCE properly yet and looks at the inline asm and
complains about it getting a u64 argument on 32-bit code ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
---
arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
1 file changed, 132 insertions(+)
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,6 +6,9 @@
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
+#include <linux/kernel.h>
+#include <vdso/bits.h>
+#include <vdso/page.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
+enum addr_stride {
+ PTE_STRIDE = 0,
+ PMD_STRIDE = 1
+};
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+/*
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
+ *
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
+ * be done in a parallel fashion.
+ *
+ * The instruction takes the number of extra pages to invalidate, beyond
+ * the first page, while __invlpgb gets the more human readable number of
+ * pages to invalidate.
+ *
+ * The bits in rax[0:2] determine respectively which components of the address
+ * (VA, PCID, ASID) get compared when flushing. If none of these bits are set, *any*
+ * address in the specified range matches.
+ *
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
+ * this CPU have completed.
+ */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride stride, u8 flags)
+{
+ u32 edx = (pcid << 16) | asid;
+ u32 ecx = (stride << 31) | (nr_pages - 1);
+ u64 rax = addr | flags;
+
+ /* The low bits in rax are for flags. Verify addr is clean. */
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
+
+ /* INVLPGB; supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
+}
+
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
+{
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
+}
+
+static inline void __tlbsync(void)
+{
+ /*
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
+ * to have completed. Print a warning if the task has been migrated,
+ * and might not be waiting on all the INVLPGBs issued during this TLB
+ * invalidation sequence.
+ */
+ cant_migrate();
+
+ /* TLBSYNC: supported in binutils >= 2.36. */
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
+}
+#else
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
+ unsigned long addr, u16 nr_pages,
+ enum addr_stride s, u8 flags) { }
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
+static inline void __tlbsync(void) { }
+#endif
+
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The second invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+ unsigned long addr,
+ u16 nr, bool stride)
+{
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
+
+ __invlpgb(0, pcid, addr, nr, str, flags);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invlpgb_flush_all(void)
+{
+ /*
+ * TLBSYNC at the end needs to make sure all flushes done on the
+ * current CPU have been executed system-wide. Therefore, make
+ * sure nothing gets migrated in-between but disable preemption
+ * as it is cheaper.
+ */
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
+ __tlbsync();
+}
+
+/* Flush addr, including globals, for all PCIDs. */
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invlpgb_flush_all_nonglobals(void)
+{
+ guard(preempt)();
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
+ __tlbsync();
+}
#endif /* _ASM_X86_TLB_H */
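A hedged usage sketch of these helpers (kernel context assumed; the PCID
value, page count and stride are illustrative, not taken from the patch):

	preempt_disable();	/* TLBSYNC must run on the CPU that issued INVLPGB */
	invlpgb_flush_user_nr_nosync(5, addr, 16, false);	/* false: PTE_STRIDE */
	__tlbsync();		/* wait for our pending INVLPGBs to complete */
	preempt_enable();

For PCID 5, 16 pages and a 4 KiB stride, __invlpgb() encodes this as
rax = addr | INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA, ecx = (PTE_STRIDE << 31) | 15
and edx = 5 << 16.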


@@ -0,0 +1,97 @@
From 293fdf15ead45cd235e12a4f62f81767f7bce528 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:39 -0500
Subject: x86/mm: Use INVLPGB for kernel TLB flushes
Use broadcast TLB invalidation for kernel addresses when available.
Remove the need to send IPIs for kernel TLB flushes.
[ bp: Integrate dhansen's comments additions, merge the
flush_tlb_all() change into this one too. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
---
arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 44 insertions(+), 4 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1064,7 +1064,6 @@ void flush_tlb_mm_range(struct mm_struct
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -1074,7 +1073,32 @@ static void do_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+
+ /* First try (faster) hardware-assisted TLB invalidation. */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ /* Fall back to the IPI-based invalidation. */
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+/* Flush an arbitrarily large range of memory with INVLPGB. */
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
+{
+ unsigned long addr, nr;
+
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
+ nr = (info->end - addr) >> PAGE_SHIFT;
+
+ /*
+ * INVLPGB has a limit on the size of ranges it can
+ * flush. Break up large flushes.
+ */
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+
+ invlpgb_flush_addr_nosync(addr, nr);
+ }
+ __tlbsync();
}
static void do_kernel_range_flush(void *info)
@@ -1087,6 +1111,22 @@ static void do_kernel_range_flush(void *
flush_tlb_one_kernel(addr);
}
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_flush_all();
+ else
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ invlpgb_kernel_range_flush(info);
+ else
+ on_each_cpu(do_kernel_range_flush, info, 1);
+}
+
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
struct flush_tlb_info *info;
@@ -1097,9 +1137,9 @@ void flush_tlb_kernel_range(unsigned lon
TLB_GENERATION_INVALID);
if (info->end == TLB_FLUSH_ALL)
- on_each_cpu(do_flush_tlb_all, NULL, 1);
+ kernel_tlb_flush_all(info);
else
- on_each_cpu(do_kernel_range_flush, info, 1);
+ kernel_tlb_flush_range(info);
put_flush_tlb_info();
}
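The chunking in invlpgb_kernel_range_flush() can be modeled in user space;
the 40-page range and the invlpgb_count_max value below are made up (the
real maximum comes from CPUID):

#include <stdio.h>

int main(void)
{
	unsigned long start = 0, end = 40UL << 12;	/* a 40-page range */
	unsigned long count_max = 8;			/* pretend CPUID said 8 */
	unsigned long addr, nr;

	for (addr = start; addr < end; addr += nr << 12) {
		nr = (end - addr) >> 12;
		if (nr > count_max)
			nr = count_max;			/* clamp_val() analogue */
		printf("INVLPGB %#lx + %lu pages\n", addr, nr);
	}
	/* a single TLBSYNC afterwards waits for all five flushes above */
	return 0;
}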


@@ -1,23 +1,25 @@
From 5e5219596683c3b8178e09f6ec1e75154537325f Mon Sep 17 00:00:00 2001
From a093136bdb306345cd686f47c8faae8a608cfb47 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:27 -0500
Subject: x86/mm: use broadcast TLB flushing for page reclaim TLB flushing
Date: Tue, 25 Feb 2025 22:00:41 -0500
Subject: x86/mm: Use broadcast TLB flushing in page reclaim
In the page reclaim code, we only track the CPU(s) where the TLB needs
to be flushed, rather than all the individual mappings that may be getting
invalidated.
Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather
than all the individual mappings that may be getting invalidated.
Use broadcast TLB flushing when that is available.
[ bp: Massage commit message. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com
---
arch/x86/mm/tlb.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1330,7 +1330,9 @@ void arch_tlbbatch_flush(struct arch_tlb
@@ -1320,7 +1320,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/


@@ -0,0 +1,286 @@
From ef345ff96b47f21932c489edd2ebb44fbe3cb517 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:42 -0500
Subject: x86/mm: Add global ASID allocation helper functions
Add functions to manage global ASID space. Multithreaded processes that are
simultaneously active on 4 or more CPUs can get a global ASID, resulting in the
same PCID being used for that process on every CPU.
This in turn will allow the kernel to use hardware-assisted TLB flushing
through AMD INVLPGB or Intel RAR for these processes.
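The allocator added below can be modeled in a few lines of user-space C;
the constants are illustrative stand-ins for MAX_ASID_AVAILABLE and
TLB_NR_DYN_ASIDS, and the rollover flush is reduced to a comment:

#include <stdio.h>
#include <string.h>

#define MAX_ASID  64	/* stand-in for MAX_ASID_AVAILABLE */
#define DYN_ASIDS  6	/* stand-in for TLB_NR_DYN_ASIDS */

static unsigned char used[MAX_ASID], freed[MAX_ASID];
static int last = MAX_ASID;	/* force a reset on the first allocation */

static void reset_space(void)	/* reset_global_asid_space() analogue */
{
	/* invlpgb_flush_all_nonglobals() would run here */
	for (int i = 0; i < MAX_ASID; i++)
		used[i] = used[i] && !freed[i];
	memset(freed, 0, sizeof(freed));
	last = DYN_ASIDS;
}

static int alloc_asid(void)
{
	if (last >= MAX_ASID - 1)
		reset_space();
	for (int a = last; a < MAX_ASID; a++) {
		if (!used[a]) {
			used[a] = 1;
			last = a;
			return a;
		}
	}
	return 0;		/* space exhausted */
}

static void free_asid(int a)
{
	freed[a] = 1;		/* reusable only after the next rollover flush */
}

int main(void)
{
	int a = alloc_asid(), b = alloc_asid();

	printf("a=%d b=%d\n", a, b);		/* 6 and 7: dynamic IDs skipped */
	free_asid(a);
	printf("next=%d\n", alloc_asid());	/* 8, not 6: no reuse before flush */
	return 0;
}

For context on the comment changes in the diff: an ASID n is written to
CR3 as kPCID n + 1 because PCID 0 is special, and with KPTI the matching
uPCID is kPCID + 2048, which is why all three ranges now extend to
MAX_ASID_AVAILABLE.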
[ bp:
- Extend use_global_asid() comment
- s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g
- other touchups ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com
---
arch/x86/include/asm/mmu.h | 12 +++
arch/x86/include/asm/mmu_context.h | 2 +
arch/x86/include/asm/tlbflush.h | 37 +++++++
arch/x86/mm/tlb.c | 154 ++++++++++++++++++++++++++++-
4 files changed, 202 insertions(+), 3 deletions(-)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -69,6 +69,18 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ /*
+ * The global ASID will be a non-zero value when the process has
+ * the same ASID across all CPUs, allowing it to make use of
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
+ */
+ u16 global_asid;
+
+ /* The process is transitioning to a new global ASID number. */
+ bool asid_transition;
+#endif
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void mm_free_global_asid(struct mm_struct *mm);
+
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
+#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long
void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
+static inline bool is_dyn_asid(u16 asid)
+{
+ return asid < TLB_NR_DYN_ASIDS;
+}
+
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+static inline u16 mm_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return 0;
+
+ asid = smp_load_acquire(&mm->context.global_asid);
+
+ /* mm->context.global_asid is either 0, or a global ASID */
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
+
+ return asid;
+}
+
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
+{
+ /*
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
+ * finish_asid_transition() needs to observe asid_transition = true
+ * once it observes global_asid.
+ */
+ mm->context.asid_transition = true;
+ smp_store_release(&mm->context.global_asid, asid);
+}
+#else
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -74,13 +74,15 @@
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
- * the canonical identifier for an mm
+ * the canonical identifier for an mm, dynamically allocated on each CPU
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
+ * the canonical, global identifier for an mm, identical across all CPUs
*
- * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * kPCID - [1, MAX_ASID_AVAILABLE]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st
}
/*
+ * Global ASIDs are allocated for multi-threaded processes that are
+ * active on multiple CPUs simultaneously, giving each of those
+ * processes the same PCID on every CPU, for use with hardware-assisted
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
+ *
+ * These global ASIDs are held for the lifetime of the process.
+ */
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
+
+/*
+ * When the search for a free ASID in the global ASID space reaches
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
+ * freed global ASIDs are safe to re-use.
+ *
+ * This way the global flush only needs to happen at ASID rollover
+ * time, and not at ASID allocation time.
+ */
+static void reset_global_asid_space(void)
+{
+ lockdep_assert_held(&global_asid_lock);
+
+ invlpgb_flush_all_nonglobals();
+
+ /*
+ * The TLB flush above makes it safe to re-use the previously
+ * freed global ASIDs.
+ */
+ bitmap_andnot(global_asid_used, global_asid_used,
+ global_asid_freed, MAX_ASID_AVAILABLE);
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
+
+ /* Restart the search from the start of global ASID space. */
+ last_global_asid = TLB_NR_DYN_ASIDS;
+}
+
+static u16 allocate_global_asid(void)
+{
+ u16 asid;
+
+ lockdep_assert_held(&global_asid_lock);
+
+ /* The previous allocation hit the edge of available address space */
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
+ reset_global_asid_space();
+
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
+
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
+ /* This should never happen. */
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
+ global_asid_available);
+ return 0;
+ }
+
+ /* Claim this global ASID. */
+ __set_bit(asid, global_asid_used);
+ last_global_asid = asid;
+ global_asid_available--;
+ return asid;
+}
+
+/*
+ * Check whether a process is currently active on more than @threshold CPUs.
+ * This is a cheap estimation on whether or not it may make sense to assign
+ * a global ASID to this process, and use broadcast TLB invalidation.
+ */
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
+{
+ int count = 0;
+ int cpu;
+
+ /* This quick check should eliminate most single threaded programs. */
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
+ return false;
+
+ /* Slower check to make sure. */
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /* Skip the CPUs that aren't really running this process. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
+ continue;
+
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ continue;
+
+ if (++count > threshold)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Assign a global ASID to the current process, protecting against
+ * races between multiple threads in the process.
+ */
+static void use_global_asid(struct mm_struct *mm)
+{
+ u16 asid;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* This process is already using broadcast TLB invalidation. */
+ if (mm_global_asid(mm))
+ return;
+
+ /*
+ * The last global ASID was consumed while waiting for the lock.
+ *
+ * If this fires, a more aggressive ASID reuse scheme might be
+ * needed.
+ */
+ if (!global_asid_available) {
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
+ return;
+ }
+
+ asid = allocate_global_asid();
+ if (!asid)
+ return;
+
+ mm_assign_global_asid(mm, asid);
+}
+
+void mm_free_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ if (!mm_global_asid(mm))
+ return;
+
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
+
+ /* The global ASID can be re-used only after flush at wrap-around. */
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
+ __set_bit(mm->context.global_asid, global_asid_freed);
+
+ mm->context.global_asid = 0;
+ global_asid_available++;
+#endif
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*


@@ -0,0 +1,215 @@
From b3eb743c32515bf8fca7b619dd2a2c64b5812dd8 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:43 -0500
Subject: x86/mm: Handle global ASID context switch and TLB flush
Add context switch and TLB flush support for processes that use a global
ASID and PCID across all CPUs.
At both context switch time and TLB flush time, check whether a task is
switching to a global ASID and, if so, reload the TLB with the new ASID as
appropriate.
In both code paths, the TLB flush is avoided if a global ASID is used, because
global ASIDs are always kept up to date across CPUs, even when the process
is not running on a CPU.
[ bp:
- Massage
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com
---
arch/x86/include/asm/tlbflush.h | 14 ++++++
arch/x86/mm/tlb.c | 77 ++++++++++++++++++++++++++++++---
2 files changed, 84 insertions(+), 7 deletions(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid)
return asid < TLB_NR_DYN_ASIDS;
}
+static inline bool is_global_asid(u16 asid)
+{
+ return !is_dyn_asid(asid);
+}
+
#ifdef CONFIG_BROADCAST_TLB_FLUSH
static inline u16 mm_global_asid(struct mm_struct *mm)
{
@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid
mm->context.asid_transition = true;
smp_store_release(&mm->context.global_asid, asid);
}
+
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ return mm && READ_ONCE(mm->context.asid_transition);
+}
#else
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
#ifdef CONFIG_PARAVIRT
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st
return;
}
+ /*
+ * TLB consistency for global ASIDs is maintained with hardware assisted
+ * remote TLB flushing. Global ASIDs are always up to date.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ u16 global_asid = mm_global_asid(next);
+
+ if (global_asid) {
+ *new_asid = global_asid;
+ *need_flush = false;
+ return;
+ }
+ }
+
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc
}
/*
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
+ */
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
+{
+ u16 global_asid = mm_global_asid(mm);
+
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return false;
+
+ /* Process is transitioning to a global ASID */
+ if (global_asid && asid != global_asid)
+ return true;
+
+ return false;
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct
*/
if (prev == next) {
/* Not actually switching mm's */
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
+ /* Check if the current mm is transitioning to a global ASID */
+ if (mm_needs_global_asid(next, prev_asid)) {
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ goto reload_tlb;
+ }
+
+ /*
+ * Broadcast TLB invalidation keeps this ASID up to date
+ * all the time.
+ */
+ if (is_global_asid(prev_asid))
+ return;
+
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct
cond_mitigation(tsk);
/*
+ * Let nmi_uaccess_okay() and finish_asid_transition()
+ * know that CR3 is changing.
+ */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ /*
* Leave this CPU in prev's mm_cpumask. Atomic writes to
* mm_cpumask can be expensive under contention. The CPU
* will be removed lazily at TLB flush time.
@@ -768,14 +821,12 @@ void switch_mm_irqs_off(struct mm_struct
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-
- /* Let nmi_uaccess_okay() know that we're changing CR3. */
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
}
+reload_tlb:
new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
@@ -894,7 +945,7 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ u64 local_tlb_gen;
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
u64 mm_tlb_gen;
@@ -917,6 +968,16 @@ static void flush_tlb_func(void *info)
if (unlikely(loaded_mm == &init_mm))
return;
+ /* Reload the ASID if transitioning into or out of a global ASID */
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ }
+
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
+ if (is_global_asid(loaded_mm_asid))
+ return;
+
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
@@ -934,6 +995,8 @@ static void flush_tlb_func(void *info)
return;
}
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
f->new_tlb_gen <= local_tlb_gen)) {
/*
@@ -1101,7 +1164,7 @@ STATIC_NOPV void native_flush_tlb_multi(
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,


@@ -0,0 +1,88 @@
From c63f1d0a496de7a926b92b52061905edfc8428a4 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:44 -0500
Subject: x86/mm: Add global ASID process exit helpers
A global ASID is allocated for the lifetime of a process. Free the global ASID
at process exit time.
[ bp: Massage, create helpers, hide details inside them. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com
---
arch/x86/include/asm/mmu_context.h | 8 +++++++-
arch/x86/include/asm/tlbflush.h | 9 +++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
-#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
@@ -13,6 +12,7 @@
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
+#include <asm/desc.h>
extern atomic64_t last_mm_ctx_id;
@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+#define mm_init_global_asid mm_init_global_asid
+extern void mm_init_global_asid(struct mm_struct *mm);
+
extern void mm_free_global_asid(struct mm_struct *mm);
/*
@@ -163,6 +166,8 @@ static inline int init_new_context(struc
mm->context.execute_only_pkey = -1;
}
#endif
+
+ mm_init_global_asid(mm);
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
@@ -172,6 +177,7 @@ static inline int init_new_context(struc
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
+ mm_free_global_asid(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct
return asid;
}
+static inline void mm_init_global_asid(struct mm_struct *mm)
+{
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ mm->context.global_asid = 0;
+ mm->context.asid_transition = false;
+ }
+}
+
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
{
/*
@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
}
#else
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */


@@ -0,0 +1,219 @@
From e16bb18388207841efa841b9b11e69c886817024 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 25 Feb 2025 22:00:45 -0500
Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
processes
There is not enough room in the 12-bit ASID address space to hand out
broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
when they are observed to be simultaneously running on 4 or more CPUs.
This also allows single-threaded processes to continue using the cheaper,
local TLB invalidation instructions like INVLPG.
Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
a generically named broadcast_tlb_flush() function which can later also be
used for Intel RAR.
Combined with the removal of unnecessary lru_add_drain() calls (see
https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
a nice performance boost for the will-it-scale tlb_flush2_threads test on an
AMD Milan system with 36 cores:
- vanilla kernel: 527k loops/second
- lru_add_drain removal: 731k loops/second
- only INVLPGB: 527k loops/second
- lru_add_drain + INVLPGB: 1157k loops/second
Profiling with only the INVLPGB changes showed that, while TLB invalidation went
down from 40% of the total CPU time to only around 4% of CPU time, the
contention simply moved to the LRU lock.
Fixing both at the same time roughly doubles the number of iterations per second
in this case.
Comparing will-it-scale tlb_flush2_threads with several different numbers of
threads on a 72 CPU AMD Milan shows similar results. The number represents the
total number of loops per second across all the threads:
threads		tip		INVLPGB
1		315k		304k
2		423k		424k
4		644k		1032k
8		652k		1267k
16		737k		1368k
32		759k		1199k
64		636k		1094k
72		609k		993k
1 and 2 thread performance is similar with and without INVLPGB, because
INVLPGB is only used on processes using 4 or more CPUs simultaneously.
The number is the median across 5 runs.
Some numbers closer to real world performance can be found at Phoronix, thanks
to Michael:
https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits
[ bp:
- Massage
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
- :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
- Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com
]
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
---
arch/x86/include/asm/tlbflush.h | 6 ++
arch/x86/mm/tlb.c | 104 +++++++++++++++++++++++++++++++-
2 files changed, 109 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid
smp_store_release(&mm->context.global_asid, asid);
}
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
+{
+ WRITE_ONCE(mm->context.asid_transition, false);
+}
+
static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct
}
/*
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
+ * global ASIDs are handed out to processes that have frequent TLB
+ * flushes and are active on 4 or more CPUs simultaneously.
+ */
+static void consider_global_asid(struct mm_struct *mm)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
+ return;
+
+ /* Check every once in a while. */
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
+ return;
+
+ /*
+ * Assign a global ASID if the process is active on
+ * 4 or more CPUs simultaneously.
+ */
+ if (mm_active_cpus_exceeds(mm, 3))
+ use_global_asid(mm);
+}
+
+static void finish_asid_transition(struct flush_tlb_info *info)
+{
+ struct mm_struct *mm = info->mm;
+ int bc_asid = mm_global_asid(mm);
+ int cpu;
+
+ if (!mm_in_asid_transition(mm))
+ return;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ /*
+ * The remote CPU is context switching. Wait for that to
+ * finish, to catch the unlikely case of it switching to
+ * the target mm with an out of date ASID.
+ */
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
+ cpu_relax();
+
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
+ continue;
+
+ /*
+ * If at least one CPU is not using the global ASID yet,
+ * send a TLB flush IPI. The IPI should cause stragglers
+ * to transition soon.
+ *
+ * This can race with the CPU switching to another task;
+ * that results in a (harmless) extra IPI.
+ */
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
+ flush_tlb_multi(mm_cpumask(info->mm), info);
+ return;
+ }
+ }
+
+ /* All the CPUs running this process are using the global ASID. */
+ mm_clear_asid_transition(mm);
+}
+
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
+{
+ bool pmd = info->stride_shift == PMD_SHIFT;
+ unsigned long asid = mm_global_asid(info->mm);
+ unsigned long addr = info->start;
+
+ /*
+ * TLB flushes with INVLPGB are kicked off asynchronously.
+ * The inc_mm_tlb_gen() guarantees page table updates are done
+ * before these TLB flushes happen.
+ */
+ if (info->end == TLB_FLUSH_ALL) {
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
+ /* Do any CPUs supporting INVLPGB need PTI? */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
+ } else do {
+ unsigned long nr = 1;
+
+ if (info->stride_shift <= PMD_SHIFT) {
+ nr = (info->end - addr) >> info->stride_shift;
+ nr = clamp_val(nr, 1, invlpgb_count_max);
+ }
+
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
+
+ addr += nr << info->stride_shift;
+ } while (addr < info->end);
+
+ finish_asid_transition(info);
+
+ /* Wait for the INVLPGBs kicked off above to finish. */
+ __tlbsync();
+}
+
+/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
@@ -1260,9 +1359,12 @@ void flush_tlb_mm_range(struct mm_struct
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ if (mm_global_asid(mm)) {
+ broadcast_tlb_flush(info);
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
+ consider_global_asid(mm);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();

View File

@@ -1,28 +1,31 @@
From 101ba03a6474bbc52971505abf1e3ee9613f255b Mon Sep 17 00:00:00 2001
From 9c88454149bd22cc3d8618b4445d32aaf48cadce Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 5 Feb 2025 23:43:30 -0500
Subject: x86/mm: enable AMD translation cache extensions
Date: Tue, 25 Feb 2025 22:00:47 -0500
Subject: x86/mm: Enable AMD translation cache extensions
With AMD TCE (translation cache extensions) only the intermediate mappings
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
rather than all intermediate mappings getting zapped at every TLB invalidation.
This can help reduce the TLB miss rate, by keeping more intermediate
mappings in the cache.
This can help reduce the TLB miss rate, by keeping more intermediate mappings
in the cache.
From the AMD manual:
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit
to 1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on
TLB entries. When this bit is 0, these instructions remove the target PTE
from the TLB as well as all upper-level table entries that are cached
in the TLB, whether or not they are associated with the target PTE.
When this bit is set, these instructions will remove the target PTE and
only those upper-level entries that lead to the target PTE in
the page table hierarchy, leaving unrelated upper-level entries intact.
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
entries. When this bit is 0, these instructions remove the target PTE from the
TLB as well as all upper-level table entries that are cached in the TLB,
whether or not they are associated with the target PTE. When this bit is set,
these instructions will remove the target PTE and only those upper-level
entries that lead to the target PTE in the page table hierarchy, leaving
unrelated upper-level entries intact.
[ bp: use cpu_has()... I know, it is a mess. ]
Signed-off-by: Rik van Riel <riel@surriel.com>
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
---
arch/x86/include/asm/msr-index.h | 2 ++
arch/x86/kernel/cpu/amd.c | 4 ++++
@@ -49,13 +52,13 @@ Tested-by: Manali Shukla <Manali.Shukla@amd.com>
/*
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1071,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86
@@ -1075,6 +1075,10 @@ static void init_amd(struct cpuinfo_x86
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_feature_enabled(X86_FEATURE_TCE))
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
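
For reference, the effect of this change can be checked on a running system by reading the EFER MSR (0xC0000080) through the msr driver and testing bit 15, which the AMD manual text quoted above defines as TCE. A minimal sketch under those assumptions (requires root and `modprobe msr`; this tool is illustrative and not part of the patch):

/*
 * Illustrative only: read the EFER MSR (0xC0000080) via /dev/cpu/0/msr
 * and report whether the TCE bit (bit 15) is set after boot.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_EFER	0xC0000080
#define EFER_TCE_BIT	15

int main(void)
{
	uint64_t efer;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	if (pread(fd, &efer, sizeof(efer), MSR_EFER) != sizeof(efer)) {
		perror("pread");
		close(fd);
		return 1;
	}
	close(fd);

	printf("EFER = %#llx, TCE %s\n", (unsigned long long)efer,
	       (efer >> EFER_TCE_BIT) & 1 ? "enabled" : "disabled");
	return 0;
}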

View File

@@ -0,0 +1,121 @@
From 20dfd0edb14a1d0aecd5eb227f2db64487201976 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Tue, 4 Mar 2025 12:59:56 +0100
Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
the ASID valid bit is not set, the instruction will flush the TLB entries that
match the specified criteria for any ASID, not just those of the host. If
virtual machines are running on the system, this may result in inadvertent
flushes of guest TLB entries.
When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
not intercepted by the hypervisor, the hardware will replace the requested ASID
with the guest ASID and set the ASID valid bit before doing the broadcast
invalidation. Thus a guest is only able to flush its own TLB entries.
So to limit the host TLB flushing reach, always set the ASID valid bit using an
ASID value of 0 (which represents the host/hypervisor). This will result in
the desired effect in both host and guest.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local
---
arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 26 deletions(-)
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -33,6 +33,27 @@ enum addr_stride {
PMD_STRIDE = 1
};
+/*
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
+ * of the three. For example:
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
+ *
+ * The first is used to invalidate (kernel) mappings at a particular
+ * address across all processes.
+ *
+ * The latter invalidates all TLB entries matching a PCID.
+ */
+#define INVLPGB_FLAG_VA BIT(0)
+#define INVLPGB_FLAG_PCID BIT(1)
+#define INVLPGB_FLAG_ASID BIT(2)
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
+
+/* The implied mode when all bits are clear: */
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
+
#ifdef CONFIG_BROADCAST_TLB_FLUSH
/*
* INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
@@ -40,14 +61,20 @@ enum addr_stride {
* The INVLPGB instruction is weakly ordered, and a batch of invalidations can
* be done in a parallel fashion.
*
- * The instruction takes the number of extra pages to invalidate, beyond
- * the first page, while __invlpgb gets the more human readable number of
- * pages to invalidate.
+ * The instruction takes the number of extra pages to invalidate, beyond the
+ * first page, while __invlpgb gets the more human readable number of pages to
+ * invalidate.
*
* The bits in rax[0:2] determine respectively which components of the address
* (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
* address in the specified range matches.
*
+ * Since it is desired to only flush TLB entries for the ASID that is executing
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
+ * use the actual ASID value of the guest.
+ *
* TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
* this CPU have completed.
*/
@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo
unsigned long addr, u16 nr_pages,
enum addr_stride stride, u8 flags)
{
- u32 edx = (pcid << 16) | asid;
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
u32 ecx = (stride << 31) | (nr_pages - 1);
- u64 rax = addr | flags;
+ u32 edx = (pcid << 16) | asid;
/* The low bits in rax are for flags. Verify addr is clean. */
VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne
static inline void __tlbsync(void) { }
#endif
-/*
- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
- * of the three. For example:
- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
- * - FLAG_PCID: invalidate all TLB entries matching the PCID
- *
- * The first is used to invalidate (kernel) mappings at a particular
- * address across all processes.
- *
- * The latter invalidates all TLB entries matching a PCID.
- */
-#define INVLPGB_FLAG_VA BIT(0)
-#define INVLPGB_FLAG_PCID BIT(1)
-#define INVLPGB_FLAG_ASID BIT(2)
-#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
-#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
-#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
-
-/* The implied mode when all bits are clear: */
-#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
-
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
unsigned long addr,
u16 nr, bool stride)

View File

@@ -0,0 +1,70 @@
From b5a210ad153e5448876c422f5c77d3dcd83abac6 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Wed, 19 Mar 2025 13:25:20 -0400
Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped
Track whether pages were unmapped from any MM (even ones with a currently
empty mm_cpumask) by the reclaim code, to figure out whether or not
broadcast TLB flush should be done when reclaim finishes.
The reason any MM must be tracked, and not only ones contributing to the
tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to
date even on CPUs where the MM is not currently active.
This change allows reclaim to avoid doing TLB flushes when only clean page
cache pages and/or slab memory were reclaimed, which is fairly common.
( This is a simpler alternative to the code that was in my INVLPGB series
before, and it seems to capture most of the benefit due to how common
it is to reclaim only page cache. )
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn
---
arch/x86/include/asm/tlbbatch.h | 5 +++++
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 3 ++-
3 files changed, 8 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
* the PFNs being flushed..
*/
struct cpumask cpumask;
+ /*
+ * Set if pages were unmapped from any MM, even one that does not
+ * have active CPUs in its cpumask.
+ */
+ bool unmapped_pages;
};
#endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ batch->unmapped_pages = true;
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1633,8 +1633,9 @@ void arch_tlbbatch_flush(struct arch_tlb
* a local TLB flush is needed. Optimize this use-case by calling
* flush_tlb_func_local() directly in this case.
*/
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
invlpgb_flush_all_nonglobals();
+ batch->unmapped_pages = false;
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {

View File

@@ -0,0 +1,398 @@
From 4ad0ae3b81cd90c0729df9ac5f1ff21f4dad6130 Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Mon, 30 Sep 2024 08:58:38 +0200
Subject: mm: expose per-process KSM control via syscalls
d7597f59d1d3 added a new API to enable per-process KSM control. It
however uses prctl, which doesn't allow controlling KSM from outside of
the current process.
Hence, expose this API via 3 syscalls: process_ksm_enable,
process_ksm_disable and process_ksm_status. Given sufficient privileges,
auto-KSM can be enabled by another process.
Since these syscalls are not in the upstream kernel, also expose their
numbers under /sys/kernel/process_ksm so that userspace tooling like
uksmd knows how to use them.
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
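
Since these syscall numbers are assigned out of tree, a userspace consumer is expected to discover them at runtime from the sysfs files added below and then invoke the syscall directly. The following is a minimal sketch of that flow, assuming a target PID on the command line; the helper name, error handling, and use of pidfd_open(2) are illustrative and not part of this patch:

/*
 * Illustrative only: enable KSM merging for another process via the
 * process_ksm_enable syscall exposed by this patch. The syscall number is
 * read from /sys/kernel/process_ksm/process_ksm_enable as described above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static long read_syscall_nr(const char *path)
{
	FILE *f = fopen(path, "r");
	long nr = -1;

	if (f && fscanf(f, "%ld", &nr) != 1)
		nr = -1;
	if (f)
		fclose(f);
	return nr;
}

int main(int argc, char **argv)
{
	long nr = read_syscall_nr("/sys/kernel/process_ksm/process_ksm_enable");
	pid_t target;
	int pidfd;

	if (argc < 2 || nr < 0) {
		fprintf(stderr, "usage: %s <pid> (needs process_ksm support)\n", argv[0]);
		return 1;
	}
	target = (pid_t)atoi(argv[1]);

	/* pidfd_open(2) is available since Linux 5.3. */
	pidfd = syscall(SYS_pidfd_open, target, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* flags must be 0; caller needs CAP_SYS_NICE and ptrace read access. */
	if (syscall(nr, pidfd, 0) < 0) {
		perror("process_ksm_enable");
		return 1;
	}

	printf("KSM merge-any enabled for pid %d\n", target);
	return 0;
}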
---
arch/alpha/kernel/syscalls/syscall.tbl | 3 +
arch/arm/tools/syscall.tbl | 3 +
arch/m68k/kernel/syscalls/syscall.tbl | 3 +
arch/microblaze/kernel/syscalls/syscall.tbl | 3 +
arch/mips/kernel/syscalls/syscall_n32.tbl | 3 +
arch/mips/kernel/syscalls/syscall_n64.tbl | 3 +
arch/mips/kernel/syscalls/syscall_o32.tbl | 3 +
arch/parisc/kernel/syscalls/syscall.tbl | 3 +
arch/powerpc/kernel/syscalls/syscall.tbl | 3 +
arch/s390/kernel/syscalls/syscall.tbl | 3 +
arch/sh/kernel/syscalls/syscall.tbl | 3 +
arch/sparc/kernel/syscalls/syscall.tbl | 3 +
arch/x86/entry/syscalls/syscall_32.tbl | 3 +
arch/x86/entry/syscalls/syscall_64.tbl | 3 +
arch/xtensa/kernel/syscalls/syscall.tbl | 3 +
include/linux/syscalls.h | 3 +
include/uapi/asm-generic/unistd.h | 9 +-
kernel/sys.c | 138 ++++++++++++++++++
kernel/sys_ni.c | 3 +
scripts/syscall.tbl | 3 +
.../arch/powerpc/entry/syscalls/syscall.tbl | 3 +
.../perf/arch/s390/entry/syscalls/syscall.tbl | 3 +
22 files changed, 206 insertions(+), 1 deletion(-)
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -506,3 +506,6 @@
574 common getxattrat sys_getxattrat
575 common listxattrat sys_listxattrat
576 common removexattrat sys_removexattrat
+577 common process_ksm_enable sys_process_ksm_enable
+578 common process_ksm_disable sys_process_ksm_disable
+579 common process_ksm_status sys_process_ksm_status
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -481,3 +481,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -466,3 +466,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -472,3 +472,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -405,3 +405,6 @@
464 n32 getxattrat sys_getxattrat
465 n32 listxattrat sys_listxattrat
466 n32 removexattrat sys_removexattrat
+467 n32 process_ksm_enable sys_process_ksm_enable
+468 n32 process_ksm_disable sys_process_ksm_disable
+469 n32 process_ksm_status sys_process_ksm_status
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -381,3 +381,6 @@
464 n64 getxattrat sys_getxattrat
465 n64 listxattrat sys_listxattrat
466 n64 removexattrat sys_removexattrat
+467 n64 process_ksm_enable sys_process_ksm_enable
+468 n64 process_ksm_disable sys_process_ksm_disable
+469 n64 process_ksm_status sys_process_ksm_status
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -454,3 +454,6 @@
464 o32 getxattrat sys_getxattrat
465 o32 listxattrat sys_listxattrat
466 o32 removexattrat sys_removexattrat
+467 o32 process_ksm_enable sys_process_ksm_enable
+468 o32 process_ksm_disable sys_process_ksm_disable
+469 o32 process_ksm_status sys_process_ksm_status
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -465,3 +465,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -557,3 +557,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -469,3 +469,6 @@
464 common getxattrat sys_getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -470,3 +470,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -512,3 +512,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -472,3 +472,6 @@
464 i386 getxattrat sys_getxattrat
465 i386 listxattrat sys_listxattrat
466 i386 removexattrat sys_removexattrat
+467 i386 process_ksm_enable sys_process_ksm_enable
+468 i386 process_ksm_disable sys_process_ksm_disable
+469 i386 process_ksm_status sys_process_ksm_status
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -390,6 +390,9 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
#
# Due to a historical design error, certain syscalls are numbered differently
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -437,3 +437,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -831,6 +831,9 @@ asmlinkage long sys_madvise(unsigned lon
asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
size_t vlen, int behavior, unsigned int flags);
asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -850,8 +850,15 @@ __SYSCALL(__NR_listxattrat, sys_listxatt
#define __NR_removexattrat 466
__SYSCALL(__NR_removexattrat, sys_removexattrat)
+#define __NR_process_ksm_enable 467
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
+#define __NR_process_ksm_disable 468
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
+#define __NR_process_ksm_status 469
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
+
#undef __NR_syscalls
-#define __NR_syscalls 467
+#define __NR_syscalls 470
/*
* 32 bit systems traditionally used different
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2819,6 +2819,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
return error;
}
+#ifdef CONFIG_KSM
+enum pkc_action {
+ PKSM_ENABLE = 0,
+ PKSM_DISABLE,
+ PKSM_STATUS,
+};
+
+static long do_process_ksm_control(int pidfd, enum pkc_action action)
+{
+ long ret;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned int f_flags;
+
+ task = pidfd_get_task(pidfd, &f_flags);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ /* Require CAP_SYS_NICE for influencing process performance. */
+ if (!capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
+ }
+
+ if (mmap_write_lock_killable(mm)) {
+ ret = -EINTR;
+ goto release_mm;
+ }
+
+ switch (action) {
+ case PKSM_ENABLE:
+ ret = ksm_enable_merge_any(mm);
+ break;
+ case PKSM_DISABLE:
+ ret = ksm_disable_merge_any(mm);
+ break;
+ case PKSM_STATUS:
+ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ break;
+ }
+
+ mmap_write_unlock(mm);
+
+release_mm:
+ mmput(mm);
+release_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+#endif /* CONFIG_KSM */
+
+SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+ if (flags != 0)
+ return -EINVAL;
+
+ return do_process_ksm_control(pidfd, PKSM_ENABLE);
+#else /* CONFIG_KSM */
+ return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+ if (flags != 0)
+ return -EINVAL;
+
+ return do_process_ksm_control(pidfd, PKSM_DISABLE);
+#else /* CONFIG_KSM */
+ return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+ if (flags != 0)
+ return -EINVAL;
+
+ return do_process_ksm_control(pidfd, PKSM_STATUS);
+#else /* CONFIG_KSM */
+ return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+#ifdef CONFIG_KSM
+static ssize_t process_ksm_enable_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", __NR_process_ksm_enable);
+}
+static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable);
+
+static ssize_t process_ksm_disable_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", __NR_process_ksm_disable);
+}
+static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable);
+
+static ssize_t process_ksm_status_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", __NR_process_ksm_status);
+}
+static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status);
+
+static struct attribute *process_ksm_sysfs_attrs[] = {
+ &process_ksm_enable_attr.attr,
+ &process_ksm_disable_attr.attr,
+ &process_ksm_status_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group process_ksm_sysfs_attr_group = {
+ .attrs = process_ksm_sysfs_attrs,
+ .name = "process_ksm",
+};
+
+static int __init process_ksm_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group);
+}
+subsys_initcall(process_ksm_sysfs_init);
+#endif /* CONFIG_KSM */
+
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
struct getcpu_cache __user *, unused)
{
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,6 +186,9 @@ COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
COND_SYSCALL(process_madvise);
COND_SYSCALL(process_mrelease);
+COND_SYSCALL(process_ksm_enable);
+COND_SYSCALL(process_ksm_disable);
+COND_SYSCALL(process_ksm_status);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
COND_SYSCALL(get_mempolicy);
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -407,3 +407,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -557,3 +557,6 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -469,3 +469,6 @@
464 common getxattrat sys_getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat sys_removexattrat
+467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable
+468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable
+469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status

View File

@@ -1,4 +1,4 @@
From 95490afcba944883e7f911214391a1a1e2fa3261 Mon Sep 17 00:00:00 2001
From 6d141e3121676e9ca50d6465a622b9a5d572219a Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Mon, 26 Apr 2021 22:12:46 +0200
Subject: ZEN: Add VHBA driver
@@ -10,8 +10,8 @@ tag vhba-module-20240917
drivers/scsi/Makefile | 1 +
drivers/scsi/vhba/Kconfig | 9 +
drivers/scsi/vhba/Makefile | 4 +
drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++
5 files changed, 1140 insertions(+)
drivers/scsi/vhba/vhba.c | 1130 ++++++++++++++++++++++++++++++++++++
5 files changed, 1146 insertions(+)
create mode 100644 drivers/scsi/vhba/Kconfig
create mode 100644 drivers/scsi/vhba/Makefile
create mode 100644 drivers/scsi/vhba/vhba.c
@@ -56,7 +56,7 @@ tag vhba-module-20240917
+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror
--- /dev/null
+++ b/drivers/scsi/vhba/vhba.c
@@ -0,0 +1,1124 @@
@@ -0,0 +1,1130 @@
+/*
+ * vhba.c
+ *
@@ -1108,7 +1108,11 @@ tag vhba-module-20240917
+ return 0;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
+static int vhba_remove (struct platform_device *pdev)
+#else
+static void vhba_remove (struct platform_device *pdev)
+#endif
+{
+ struct vhba_host *vhost;
+ struct Scsi_Host *shost;
@@ -1121,7 +1125,9 @@ tag vhba-module-20240917
+
+ kfree(vhost->commands);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
+ return 0;
+#endif
+}
+
+static void vhba_release (struct device * dev)

View File

@@ -0,0 +1,28 @@
From 1f9910c9a54b424ad0cd415b981986937618c4ec Mon Sep 17 00:00:00 2001
From: Rok Mandeljc <rok.mandeljc@gmail.com>
Date: Mon, 3 Feb 2025 21:05:32 +0100
Subject: VHBA: fix building with kernel 6.14-rc1
Kernel 6.14-rc1 simplified the selection of tag allocation policy.
Instead of enum-based value, a boolean is used, and the corresponding
field in the `scsi_host_template` structure was renamed from
`tag_alloc_policy` to `tag_alloc_policy_rr`.
See: https://github.com/torvalds/linux/commit/ce32496
---
drivers/scsi/vhba/vhba.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/scsi/vhba/vhba.c
+++ b/drivers/scsi/vhba/vhba.c
@@ -537,7 +537,9 @@ static struct scsi_host_template vhba_te
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
.slave_alloc = vhba_slave_alloc,
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 14, 0)
+ .tag_alloc_policy_rr = true,
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
.tag_alloc_policy = BLK_TAG_ALLOC_RR,
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)

View File

@@ -1,35 +0,0 @@
From 8a6a60b5a71d7f85351a9350eb651c4ce15b8f00 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Sun, 15 Sep 2024 19:05:46 +0000
Subject: vhba: Fix compat with kernel 6.11
Upstream commit 0edb555a65d1ef047a9805051c36922b52a38a9d changed the
return value of the `remove` callback from `int` to `void`.
---
drivers/scsi/vhba/vhba.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/drivers/scsi/vhba/vhba.c
+++ b/drivers/scsi/vhba/vhba.c
@@ -1049,7 +1049,11 @@ static int vhba_probe (struct platform_d
return 0;
}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
static int vhba_remove (struct platform_device *pdev)
+#else
+static void vhba_remove (struct platform_device *pdev)
+#endif
{
struct vhba_host *vhost;
struct Scsi_Host *shost;
@@ -1062,7 +1066,9 @@ static int vhba_remove (struct platform_
kfree(vhost->commands);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
return 0;
+#endif
}
static void vhba_release (struct device * dev)

View File

@@ -1,4 +1,4 @@
From 1cdff301de6db901bc2bfd7ce78016d9b824d667 Mon Sep 17 00:00:00 2001
From 02b4d790bb05e24e7408a147f33e4e9ca0b805fa Mon Sep 17 00:00:00 2001
From: Daniel Drake <drake@endlessm.com>
Date: Tue, 4 Jun 2019 14:51:21 +0800
Subject: ZEN: PCI: Add Intel remapped NVMe device support
@@ -135,8 +135,8 @@ Contains:
}
static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1896,7 +1889,9 @@ static int ahci_init_one(struct pci_dev
hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
@@ -1898,7 +1891,9 @@ static int ahci_init_one(struct pci_dev
return -ENOMEM;
/* detect remapped nvme devices */
- ahci_remap_check(pdev, ahci_pci_bar, hpriv);

View File

@@ -1,4 +1,4 @@
From 87b0cab8d8701db7754e5778b93ff83ffc64c7ae Mon Sep 17 00:00:00 2001
From 17190525fdc9c9f73fe22832ab0631e9e1bbad6d Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Sun, 8 Mar 2020 00:31:35 -0800
Subject: ZEN: Disable stack conservation for GCC
@@ -15,7 +15,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
--- a/Makefile
+++ b/Makefile
@@ -1026,11 +1026,6 @@ KBUILD_CFLAGS += -fno-strict-overflow
@@ -1078,11 +1078,6 @@ KBUILD_CFLAGS += -fno-strict-overflow
# Make sure -fstack-check isn't enabled (like gentoo apparently did)
KBUILD_CFLAGS += -fno-stack-check
@@ -24,6 +24,6 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
-KBUILD_CFLAGS += -fconserve-stack
-endif
-
# change __FILE__ to the relative path from the srctree
KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
# change __FILE__ to the relative path to the source directory
ifdef building_out_of_srctree
KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srcroot)/=)

View File

@@ -1,43 +0,0 @@
From 48d2ea8801ccf8bd9cd48c12fce79040bbcae363 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 2 Jun 2016 23:36:32 -0500
Subject: ZEN: Initialize ata before graphics
ATA init is the long pole in the boot process, and its asynchronous.
move the graphics init after it so that ata and graphics initialize
in parallel
---
drivers/Makefile | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,14 +64,8 @@ obj-y += char/
# iommu/ comes before gpu as gpu are using iommu controllers
obj-y += iommu/
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y += gpu/
-
obj-$(CONFIG_CONNECTOR) += connector/
-# i810fb depends on char/agp/
-obj-$(CONFIG_FB_I810) += video/fbdev/i810/
-
obj-$(CONFIG_PARPORT) += parport/
obj-y += base/ block/ misc/ mfd/ nfc/
obj-$(CONFIG_LIBNVDIMM) += nvdimm/
@@ -83,6 +77,13 @@ obj-y += macintosh/
obj-y += scsi/
obj-y += nvme/
obj-$(CONFIG_ATA) += ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y += gpu/
+
+# i810fb depends on char/agp/
+obj-$(CONFIG_FB_I810) += video/fbdev/i810/
+
obj-$(CONFIG_TARGET_CORE) += target/
obj-$(CONFIG_MTD) += mtd/
obj-$(CONFIG_SPI) += spi/

View File

@@ -1,4 +1,4 @@
From 2f3e9fbc48151e4499f9cbd810d9467ac34b0a3b Mon Sep 17 00:00:00 2001
From 2b801ae725ae05be994d374efdce8fc2e828687f Mon Sep 17 00:00:00 2001
From: Kenny Levinsen <kl@kl.wtf>
Date: Sun, 27 Dec 2020 14:43:13 +0000
Subject: ZEN: Input: evdev - use call_rcu when detaching client

View File

@@ -1,4 +1,4 @@
From 51026b78d015797e216aadc4e80158181c2c2bb4 Mon Sep 17 00:00:00 2001
From 3777b5340ebf0460e6fb79205b294dd4333c9d8b Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Mon, 11 Jul 2022 19:10:30 -0500
Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State

View File

@@ -1,4 +1,4 @@
From 48c8812a4cea0190a037757589443f3103c610ba Mon Sep 17 00:00:00 2001
From d00df0f150c9d04cd229d42e0af906db3dfb5190 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 15 Jan 2020 20:43:56 -0600
Subject: ZEN: intel-pstate: Implement "enable" parameter
@@ -30,7 +30,7 @@ selection.
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2254,6 +2254,9 @@
@@ -2283,6 +2283,9 @@
disable
Do not enable intel_pstate as the default
scaling driver for the supported processors
@@ -42,7 +42,7 @@ selection.
governors layer of cpufreq and provides it own
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3817,6 +3817,8 @@ static int __init intel_pstate_setup(cha
@@ -3827,6 +3827,8 @@ static int __init intel_pstate_setup(cha
if (!strcmp(str, "disable"))
no_load = 1;

View File

@@ -1,4 +1,4 @@
From bbc56fdeaa2017d0bbed05e1e832e6d7e4bdd6e0 Mon Sep 17 00:00:00 2001
From f03da22e562a7d65a97926a76f61daeef8a1eb0d Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Fri, 15 Mar 2024 12:36:51 -0500
Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
@@ -13,7 +13,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -164,6 +164,7 @@ struct amdgpu_watchdog_timer {
@@ -160,6 +160,7 @@ struct amdgpu_watchdog_timer {
*/
extern int amdgpu_modeset;
extern unsigned int amdgpu_vram_limit;
@@ -23,7 +23,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
extern int amdgpu_gtt_size;
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -138,6 +138,7 @@ enum AMDGPU_DEBUG_MASK {
@@ -139,6 +139,7 @@ enum AMDGPU_DEBUG_MASK {
};
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -31,7 +31,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
int amdgpu_vis_vram_limit;
int amdgpu_gart_size = -1; /* auto */
int amdgpu_gtt_size = -1; /* auto */
@@ -262,6 +263,15 @@ struct amdgpu_watchdog_timer amdgpu_watc
@@ -258,6 +259,15 @@ struct amdgpu_watchdog_timer amdgpu_watc
};
/**
@@ -49,7 +49,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
*/
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -3276,6 +3276,9 @@ static ssize_t amdgpu_hwmon_show_power_c
@@ -3180,6 +3180,9 @@ static ssize_t amdgpu_hwmon_show_power_c
struct device_attribute *attr,
char *buf)
{
@@ -61,7 +61,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2793,7 +2793,10 @@ int smu_get_power_limit(void *handle,
@@ -2823,7 +2823,10 @@ int smu_get_power_limit(void *handle,
*limit = smu->max_power_limit;
break;
case SMU_PPT_LIMIT_MIN:
@@ -73,7 +73,7 @@ Subject: ZEN: drm/amdgpu/pm: Allow override of min_power_limit with
break;
default:
return -EINVAL;
@@ -2817,7 +2820,14 @@ static int smu_set_power_limit(void *han
@@ -2847,7 +2850,14 @@ static int smu_set_power_limit(void *han
if (smu->ppt_funcs->set_power_limit)
return smu->ppt_funcs->set_power_limit(smu, limit_type, limit);

View File

@@ -1,4 +1,4 @@
From 2cceda3c699f19f9c2f287614db2fe5dd009f73a Mon Sep 17 00:00:00 2001
From 5f93b67c4e2fa81be5cee3edd8ec056407d25f26 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Sun, 19 Apr 2020 19:59:18 -0700
Subject: ZEN: mm: Stop kswapd early when nothing's waiting for it to free
@@ -43,14 +43,14 @@ Contains:
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -739,6 +739,7 @@ extern void post_alloc_hook(struct page
@@ -741,6 +741,7 @@ void post_alloc_hook(struct page *page,
extern bool free_pages_prepare(struct page *page, unsigned int order);
extern int user_min_free_kbytes;
+extern atomic_long_t kswapd_waiters;
void free_unref_page(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);
struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
nodemask_t *);
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,8 @@ typedef int __bitwise fpi_t;
@@ -102,7 +102,7 @@ Contains:
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6385,7 +6385,7 @@ retry:
@@ -6382,7 +6382,7 @@ retry:
return 0;
}
@@ -111,7 +111,7 @@ Contains:
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
@@ -6414,6 +6414,10 @@ static bool allow_direct_reclaim(pg_data
@@ -6411,6 +6411,10 @@ static bool allow_direct_reclaim(pg_data
wmark_ok = free_pages > pfmemalloc_reserve / 2;
@@ -122,7 +122,7 @@ Contains:
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
@@ -6479,7 +6483,7 @@ static bool throttle_direct_reclaim(gfp_
@@ -6476,7 +6480,7 @@ static bool throttle_direct_reclaim(gfp_
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
@@ -131,7 +131,7 @@ Contains:
goto out;
break;
}
@@ -6501,11 +6505,14 @@ static bool throttle_direct_reclaim(gfp_
@@ -6498,11 +6502,14 @@ static bool throttle_direct_reclaim(gfp_
*/
if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
@@ -148,7 +148,7 @@ Contains:
if (fatal_signal_pending(current))
return true;
@@ -7008,14 +7015,14 @@ restart:
@@ -7005,14 +7012,14 @@ restart:
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&

View File

@@ -1,4 +1,4 @@
From 1ec451a4bbac7cc00b59f8ca504d6a8898615880 Mon Sep 17 00:00:00 2001
From 80b06f0f0bba019632e40c11231987a7e996c340 Mon Sep 17 00:00:00 2001
From: EXtremeExploit <pedro.montes.alcalde@gmail.com>
Date: Fri, 29 Nov 2024 13:05:27 -0300
Subject: ZEN: ahci: Disable staggered spinup by default

View File

@@ -1,4 +1,4 @@
From a31b09c511dd58e5032a3c941638207281b20ce4 Mon Sep 17 00:00:00 2001
From ac35b7af0aac6a9eb996962130a99c9af75c8b08 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 14 Dec 2024 11:23:18 -0600
Subject: ZEN: kernel/Kconfig.preempt: Remove EXPERT conditional on PREEMPT_RT
@@ -11,12 +11,12 @@ items hidden by enabling CONFIG_EXPERT.
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -69,7 +69,7 @@ config PREEMPT
@@ -88,7 +88,7 @@ endchoice
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
- depends on EXPERT && ARCH_SUPPORTS_RT
+ depends on ARCH_SUPPORTS_RT
- depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST
+ depends on ARCH_SUPPORTS_RT && !COMPILE_TEST
select PREEMPTION
help
This option turns the kernel into a real-time kernel by replacing

View File

@@ -1,4 +1,4 @@
From 530ee9b20cf436bcbb3a632cb19fb5e13a29dde7 Mon Sep 17 00:00:00 2001
From 8bf253ea1b48fe101dc0161824b9a7d85f420b84 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Mon, 27 Jan 2020 18:10:06 +0100
Subject: ZEN: INTERACTIVE: Base config item
@@ -9,7 +9,7 @@ Subject: ZEN: INTERACTIVE: Base config item
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -154,6 +154,12 @@ config THREAD_INFO_IN_TASK
@@ -157,6 +157,12 @@ config THREAD_INFO_IN_TASK
menu "General setup"

View File

@@ -1,4 +1,4 @@
From d2f0a5801471b5f67344b2c92a2aa29f1aed626a Mon Sep 17 00:00:00 2001
From d3b2ab943a1de0838c4bd515dbed45f8f1c3c2cc Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Mon, 27 Jan 2020 18:11:05 +0100
Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices
@@ -10,7 +10,7 @@ Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -568,7 +568,11 @@ static struct elevator_type *elevator_ge
@@ -560,7 +560,11 @@ static struct elevator_type *elevator_ge
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
@@ -24,7 +24,7 @@ Subject: ZEN: INTERACTIVE: Use BFQ as the elevator for SQ devices
/*
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -160,6 +160,10 @@ config ZEN_INTERACTIVE
@@ -163,6 +163,10 @@ config ZEN_INTERACTIVE
help
Tunes the kernel for responsiveness at the cost of throughput and power usage.

View File

@@ -1,4 +1,4 @@
From 346251fa257245b3a06e37de863a1dbafbf2bbc2 Mon Sep 17 00:00:00 2001
From d941bedf16b95646be26364f00cf46c6649608a6 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Mon, 12 Dec 2022 00:03:03 +0100
Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices
@@ -10,7 +10,7 @@ Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -566,7 +566,13 @@ static struct elevator_type *elevator_ge
@@ -558,7 +558,13 @@ static struct elevator_type *elevator_ge
if (q->nr_hw_queues != 1 &&
!blk_mq_is_shared_tags(q->tag_set->flags))
@@ -26,7 +26,7 @@ Subject: ZEN: INTERACTIVE: Use Kyber as the elevator for MQ devices
return elevator_find_get("bfq");
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -163,6 +163,7 @@ config ZEN_INTERACTIVE
@@ -166,6 +166,7 @@ config ZEN_INTERACTIVE
--- Block Layer ----------------------------------------
Default scheduler for SQ..: mq-deadline -> bfq

View File

@@ -1,4 +1,4 @@
From 26fcaf58616b8cb3ce042e31c640594ea2fb5987 Mon Sep 17 00:00:00 2001
From d0ce01e1def080e52770f9a899476bb840807b37 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Mon, 27 Jan 2020 18:21:09 +0100
Subject: ZEN: INTERACTIVE: Enable background reclaim of hugepages
@@ -32,7 +32,7 @@ Reasoning and details in the original patch: https://lwn.net/Articles/711248/
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -165,6 +165,10 @@ config ZEN_INTERACTIVE
@@ -168,6 +168,10 @@ config ZEN_INTERACTIVE
Default scheduler for SQ..: mq-deadline -> bfq
Default scheduler for MQ..: none -> kyber
@@ -45,7 +45,7 @@ Reasoning and details in the original patch: https://lwn.net/Articles/711248/
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags
@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif

View File

@@ -1,4 +1,4 @@
From 9e5b04df7190ab4750ae3c67714fd537ef4d79f5 Mon Sep 17 00:00:00 2001
From f1fd33efd4b70519ff51b78c62d6fdf7d4f69620 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Tue, 31 Oct 2023 19:03:10 +0100
Subject: ZEN: INTERACTIVE: Tune EEVDF for interactivity
@@ -42,7 +42,7 @@ caused by rebalancing too many tasks at once.
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -169,6 +169,13 @@ config ZEN_INTERACTIVE
@@ -172,6 +172,13 @@ config ZEN_INTERACTIVE
Background-reclaim hugepages...: no -> yes
@@ -58,7 +58,7 @@ caused by rebalancing too many tasks at once.
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scalin
@@ -76,10 +76,19 @@ unsigned int sysctl_sched_tunable_scalin
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
@@ -78,7 +78,7 @@ caused by rebalancing too many tasks at once.
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cp
@@ -124,8 +133,12 @@ int __weak arch_asym_cpu_priority(int cp
*
* (default: 5 msec, units: microseconds)
*/
@@ -93,7 +93,7 @@ caused by rebalancing too many tasks at once.
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2797,7 +2797,7 @@ extern void deactivate_task(struct rq *r
@@ -2837,7 +2837,7 @@ extern void deactivate_task(struct rq *r
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);

View File

@@ -1,4 +1,4 @@
From f654ea11471f81ac7dd68467f552db25722df25e Mon Sep 17 00:00:00 2001
From 75f2a8831bd24a35d9853b11dabc06a138c5e445 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Mon, 27 Jan 2020 18:27:16 +0100
Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity
@@ -75,7 +75,7 @@ Remove MuQSS cpufreq configuration.
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -176,6 +176,12 @@ config ZEN_INTERACTIVE
@@ -179,6 +179,12 @@ config ZEN_INTERACTIVE
Bandwidth slice size...........: 5 -> 3 ms
Task rebalancing threshold.....: 32 -> 8

View File

@@ -1,4 +1,4 @@
From f138e9762fd03612db5593f4c267c8f8b5799159 Mon Sep 17 00:00:00 2001
From b82d80a4195f179b9c0d0c80f662a7f42ed21ce8 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 5 Mar 2022 11:37:14 -0600
Subject: ZEN: INTERACTIVE: mm: Disable unevictable compaction
@@ -12,7 +12,7 @@ turn it off when CONFIG_ZEN_INTERACTIVE is set as well.
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -168,6 +168,7 @@ config ZEN_INTERACTIVE
@@ -171,6 +171,7 @@ config ZEN_INTERACTIVE
--- Virtual Memory Subsystem ---------------------------
Background-reclaim hugepages...: no -> yes
@@ -22,7 +22,7 @@ turn it off when CONFIG_ZEN_INTERACTIVE is set as well.
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,7 @@ config COMPACTION
@@ -691,7 +691,7 @@ config COMPACTION
config COMPACT_UNEVICTABLE_DEFAULT
int
depends on COMPACTION

View File

@@ -1,4 +1,4 @@
From 76960c3806e7dfb618f49677cc84dafbfe48e4c4 Mon Sep 17 00:00:00 2001
From 7227af3e01f9ae5a2bcdc9aa652c973438938eb3 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Sat, 28 Mar 2020 13:06:28 -0700
Subject: ZEN: INTERACTIVE: mm: Disable watermark boosting by default
@@ -33,7 +33,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -169,6 +169,7 @@ config ZEN_INTERACTIVE
@@ -172,6 +172,7 @@ config ZEN_INTERACTIVE
Background-reclaim hugepages...: no -> yes
Compact unevictable............: yes -> no

View File

@@ -1,4 +1,4 @@
From fc3e794cecb686d4e05c6ed86fdf9b2dbd725ea9 Mon Sep 17 00:00:00 2001
From 91187cefc66b9c186a78d7bd996088fc74c66c99 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Wed, 20 Oct 2021 20:50:11 -0700
Subject: ZEN: INTERACTIVE: mm: Lower the non-hugetlbpage pageblock size to
@@ -47,7 +47,7 @@ Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -170,6 +170,7 @@ config ZEN_INTERACTIVE
@@ -173,6 +173,7 @@ config ZEN_INTERACTIVE
Background-reclaim hugepages...: no -> yes
Compact unevictable............: yes -> no
Watermark boost factor.........: 1.5 -> 0

View File

@@ -1,4 +1,4 @@
From be57a2710aef65116767d26930dd1251ff6e060f Mon Sep 17 00:00:00 2001
From 779648709dc797dac595e3007b4c7c3fee254537 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sat, 21 May 2022 15:15:09 -0500
Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops
@@ -20,7 +20,7 @@ Fixes: https://github.com/zen-kernel/zen-kernel/issues/282
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3308,6 +3308,11 @@ static int crypt_ctr(struct dm_target *t
@@ -3305,6 +3305,11 @@ static int crypt_ctr(struct dm_target *t
goto bad;
}
@@ -34,7 +34,7 @@ Fixes: https://github.com/zen-kernel/zen-kernel/issues/282
goto bad;
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -164,6 +164,7 @@ config ZEN_INTERACTIVE
@@ -167,6 +167,7 @@ config ZEN_INTERACTIVE
Default scheduler for SQ..: mq-deadline -> bfq
Default scheduler for MQ..: none -> kyber

View File

@@ -1,4 +1,4 @@
From 41fe25c2e4e89c6afd35e3feb720e5a6797857d3 Mon Sep 17 00:00:00 2001
From ef87b1cb12134c34eed834315b03c4a6747b5716 Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Mon, 5 Sep 2022 11:35:20 -0500
Subject: ZEN: INTERACTIVE: mm/swap: Disable swap-in readahead
@@ -20,7 +20,7 @@ same change so Zen Kernel users benefit.
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -172,6 +172,7 @@ config ZEN_INTERACTIVE
@@ -175,6 +175,7 @@ config ZEN_INTERACTIVE
Compact unevictable............: yes -> no
Watermark boost factor.........: 1.5 -> 0
Pageblock order................: 10 -> 3
@@ -30,7 +30,7 @@ same change so Zen Kernel users benefit.
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1080,6 +1080,10 @@ void folio_batch_remove_exceptionals(str
@@ -1081,6 +1081,10 @@ void folio_batch_remove_exceptionals(str
*/
void __init swap_setup(void)
{
@@ -41,7 +41,7 @@ same change so Zen Kernel users benefit.
unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
/* Use a smaller cluster for small-memory machines */
@@ -1091,4 +1095,5 @@ void __init swap_setup(void)
@@ -1092,4 +1096,5 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/

View File

@@ -1,4 +1,4 @@
From 40de9c08129e2d8e182a166df2f1e823f70fa31d Mon Sep 17 00:00:00 2001
From cb33a6dc022faa07ac1e1cd544567b28a7e9afeb Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Sun, 19 Sep 2021 16:03:36 -0500
Subject: ZEN: INTERACTIVE: Document PDS/BMQ configuration
@@ -9,7 +9,7 @@ Subject: ZEN: INTERACTIVE: Document PDS/BMQ configuration
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -181,6 +181,11 @@ config ZEN_INTERACTIVE
@@ -184,6 +184,11 @@ config ZEN_INTERACTIVE
Bandwidth slice size...........: 5 -> 3 ms
Task rebalancing threshold.....: 32 -> 8

View File

@@ -1,194 +0,0 @@
From eacae6d88bcc8a925124f97b7788bb2bfac8b267 Mon Sep 17 00:00:00 2001
From: Vinay Banakar <vny@google.com>
Date: Mon, 20 Jan 2025 16:47:29 -0600
Subject: mm: Optimize TLB flushes during page reclaim

The current implementation in shrink_folio_list() performs full TLB
flushes and issues IPIs for each individual page being reclaimed. This
causes unnecessary overhead during memory reclaim, whether triggered
by madvise(MADV_PAGEOUT) or kswapd, especially in scenarios where
applications are actively moving cold pages to swap while maintaining
high performance requirements for hot pages.

The current code:
1. Clears PTE and unmaps each page individually
2. Performs a full TLB flush on all cores using the VMA (via CR3 write) or
issues individual TLB shootdowns (invlpg+invlpcid) for single-core usage
3. Submits each page individually to BIO

This approach results in:
- Excessive full TLB flushes across all cores
- Unnecessary IPI storms when processing multiple pages
- Suboptimal I/O submission patterns

I initially tried using selective TLB shootdowns (invlpg) instead of
full TLB flushes per each page to avoid interference with other
threads. However, this approach still required sending IPIs to all
cores for each page, which did not significantly improve application
throughput.

This patch instead optimizes the process by batching operations,
issuing one IPI per PMD instead of per page. This reduces interrupts
by a factor of 512 and enables batching page submissions to BIO. The
new approach:
1. Collect dirty pages that need to be written back
2. Issue a single TLB flush for all dirty pages in the batch
3. Process the collected pages for writebacks (submit to BIO)

Testing shows significant reduction in application throughput impact
during page-out operations. Applications maintain better performance
during memory reclaim, when triggered by explicit
madvise(MADV_PAGEOUT) calls.

I'd appreciate your feedback on this approach, especially on the
correctness of batched BIO submissions. Looking forward to your
comments.

Signed-off-by: Vinay Banakar <vny@google.com>
---
mm/vmscan.c | 120 ++++++++++++++++++++++++++++++++--------------------
1 file changed, 74 insertions(+), 46 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1053,6 +1053,7 @@ static unsigned int shrink_folio_list(st
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
+ LIST_HEAD(pageout_list);
unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1365,52 +1366,9 @@ retry:
if (!sc->may_writepage)
goto keep_locked;
- /*
- * Folio is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after I/O
- * starts and then write it out here.
- */
- try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug, folio_list)) {
- case PAGE_KEEP:
- goto keep_locked;
- case PAGE_ACTIVATE:
- /*
- * If shmem folio is split when writeback to swap,
- * the tail pages will make their own pass through
- * this function and be accounted then.
- */
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- goto activate_locked;
- case PAGE_SUCCESS:
- if (nr_pages > 1 && !folio_test_large(folio)) {
- sc->nr_scanned -= (nr_pages - 1);
- nr_pages = 1;
- }
- stat->nr_pageout += nr_pages;
-
- if (folio_test_writeback(folio))
- goto keep;
- if (folio_test_dirty(folio))
- goto keep;
-
- /*
- * A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the folio.
- */
- if (!folio_trylock(folio))
- goto keep;
- if (folio_test_dirty(folio) ||
- folio_test_writeback(folio))
- goto keep_locked;
- mapping = folio_mapping(folio);
- fallthrough;
- case PAGE_CLEAN:
- ; /* try to free the folio below */
- }
+ /* Add to pageout list for deferred bio submissions */
+ list_add(&folio->lru, &pageout_list);
+ continue;
}
/*
@@ -1521,6 +1479,76 @@ keep:
}
/* 'folio_list' is always empty here */
+ if (!list_empty(&pageout_list)) {
+ /*
+ * Batch TLB flushes by flushing once before processing all dirty pages.
+ * Since we operate on one PMD at a time, this batches TLB flushes at
+ * PMD granularity rather than per-page, reducing IPIs.
+ */
+ struct address_space *mapping;
+ try_to_unmap_flush_dirty();
+
+ while (!list_empty(&pageout_list)) {
+ struct folio *folio = lru_to_folio(&pageout_list);
+ list_del(&folio->lru);
+
+ /* Recheck if page got reactivated */
+ if (folio_test_active(folio) ||
+ (folio_mapped(folio) && folio_test_young(folio)))
+ goto skip_pageout_locked;
+
+ mapping = folio_mapping(folio);
+ pageout_t pageout_res = pageout(folio, mapping, &plug, &pageout_list);
+ switch (pageout_res) {
+ case PAGE_KEEP:
+ goto skip_pageout_locked;
+ case PAGE_ACTIVATE:
+ goto skip_pageout_locked;
+ case PAGE_SUCCESS:
+ stat->nr_pageout += folio_nr_pages(folio);
+
+ if (folio_test_writeback(folio) ||
+ folio_test_dirty(folio))
+ goto skip_pageout;
+
+ /*
+ * A synchronous write - probably a ramdisk. Go
+ * ahead and try to reclaim the folio.
+ */
+ if (!folio_trylock(folio))
+ goto skip_pageout;
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
+ goto skip_pageout_locked;
+
+ // Try to free the page
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+
+ case PAGE_CLEAN:
+ if (!mapping ||
+ !__remove_mapping(mapping, folio, true,
+ sc->target_mem_cgroup))
+ goto skip_pageout_locked;
+
+ nr_reclaimed += folio_nr_pages(folio);
+ folio_unlock(folio);
+ continue;
+ }
+
+skip_pageout_locked:
+ folio_unlock(folio);
+skip_pageout:
+ list_add(&folio->lru, &ret_folios);
+ }
+ }
+
/* Migrate folios selected for demotion */
nr_demoted = demote_folio_list(&demote_folios, pgdat);
nr_reclaimed += nr_demoted;
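
As an aside for readers skimming the deleted patch above, the following is a
minimal userspace sketch of the batching idea it describes: one simulated TLB
flush per batch of dirty pages instead of one flush per page. It is an
illustration only; struct page, flush_tlb_all() and submit_writeback() are
stand-ins invented for the sketch and are not the kernel interfaces used in
mm/vmscan.c.

/* batching_sketch.c - hypothetical model of per-page vs batched flushing */
#include <stdio.h>

struct page { int id; int dirty; };          /* toy stand-in for a folio/page */

static int tlb_flushes;                      /* counts simulated flush broadcasts */

static void flush_tlb_all(void)              { tlb_flushes++; }
static void submit_writeback(struct page *p) { printf("writeback page %d\n", p->id); }

/* Old behaviour: flush once per dirty page before writing it out. */
static void reclaim_per_page(struct page *pages, int n)
{
    for (int i = 0; i < n; i++) {
        if (!pages[i].dirty)
            continue;
        flush_tlb_all();
        submit_writeback(&pages[i]);
        pages[i].dirty = 0;
    }
}

/* Batched behaviour: collect dirty pages, flush once, then write them all. */
static void reclaim_batched(struct page *pages, int n)
{
    struct page *batch[512];
    int batched = 0;

    for (int i = 0; i < n; i++)
        if (pages[i].dirty)
            batch[batched++] = &pages[i];

    if (batched) {
        flush_tlb_all();                     /* single flush covers the batch */
        for (int i = 0; i < batched; i++) {
            submit_writeback(batch[i]);
            batch[i]->dirty = 0;
        }
    }
}

int main(void)
{
    struct page a[8], b[8];
    for (int i = 0; i < 8; i++) {
        a[i].id = b[i].id = i;
        a[i].dirty = b[i].dirty = (i % 2);   /* every other page is dirty */
    }

    tlb_flushes = 0;
    reclaim_per_page(a, 8);
    printf("per-page reclaim: %d flushes\n", tlb_flushes);

    tlb_flushes = 0;
    reclaim_batched(b, 8);
    printf("batched reclaim:  %d flushes\n", tlb_flushes);
    return 0;
}

Run standalone, the per-page path reports one flush per dirty page while the
batched path reports a single flush; the real patch applies the same idea at
PMD granularity inside shrink_folio_list(), which is where the claimed
reduction in IPIs comes from.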