release 6.15.2 (preliminary)

2025-06-18 12:24:58 +03:00
parent 4d2691343a
commit 43dc655d2e
242 changed files with 7729 additions and 32303 deletions


@@ -1,176 +0,0 @@
From 4506de20739ac4726a258faa98609a552184d2d2 Mon Sep 17 00:00:00 2001
From: Sergio González Collado <sergio.collado@gmail.com>
Date: Sun, 2 Mar 2025 23:15:18 +0100
Subject: Kunit to check the longest symbol length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The longest length of a symbol (KSYM_NAME_LEN) was increased to 512
in the reference [1]. This patch adds a KUnit test suite to check the
longest symbol length. These tests verify that the longest symbol length
defined is supported.

This test can also help other efforts toward longer symbol lengths,
like [2].

The test suite defines one symbol with the longest possible length.
The first test verifies that a function with the created symbol name
can be called. The second test verifies that the symbol is present in
the kernel symbol table.
[1] https://lore.kernel.org/lkml/20220802015052.10452-6-ojeda@kernel.org/
[2] https://lore.kernel.org/lkml/20240605032120.3179157-1-song@kernel.org/
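Independently of the KUnit suite, a quick sanity check of how long symbol
names get on a running kernel can be done from userspace by scanning
/proc/kallsyms. A minimal sketch (not part of this patch; it only assumes
the usual "address type name [module]" kallsyms line format):
```
/* Report the longest symbol name currently visible in /proc/kallsyms.
 * KSYM_NAME_LEN (512) includes the terminating NUL, so names of up to
 * 511 characters are expected to fit. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[1024], name[1024];
	size_t len, best = 0;
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* each line: "<address> <type> <name> [module]" */
		if (sscanf(line, "%*s %*s %1023s", name) != 1)
			continue;
		len = strlen(name);
		if (len > best)
			best = len;
	}
	fclose(f);
	printf("longest symbol name seen: %zu characters\n", best);
	return 0;
}
```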
Tested-by: Martin Rodriguez Reboredo <yakoyoku@gmail.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Rae Moar <rmoar@google.com>
Signed-off-by: Sergio González Collado <sergio.collado@gmail.com>
Link: https://github.com/Rust-for-Linux/linux/issues/504
Source: https://lore.kernel.org/rust-for-linux/20250302221518.76874-1-sergio.collado@gmail.com/
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/63
---
arch/x86/tools/insn_decoder_test.c | 3 +-
lib/Kconfig.debug | 9 ++++
lib/Makefile | 2 +
lib/longest_symbol_kunit.c | 82 ++++++++++++++++++++++++++++++
4 files changed, 95 insertions(+), 1 deletion(-)
create mode 100644 lib/longest_symbol_kunit.c
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <unistd.h>
#include <stdarg.h>
+#include <linux/kallsyms.h>
#define unlikely(cond) (cond)
@@ -106,7 +107,7 @@ static void parse_args(int argc, char **
}
}
-#define BUFSIZE 256
+#define BUFSIZE (256 + KSYM_NAME_LEN)
int main(int argc, char **argv)
{
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2838,6 +2838,15 @@ config FORTIFY_KUNIT_TEST
by the str*() and mem*() family of functions. For testing runtime
traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests.
+config LONGEST_SYM_KUNIT_TEST
+ tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
+ depends on KUNIT && KPROBES
+ default KUNIT_ALL_TESTS
+ help
+ Tests the longest symbol possible
+
+ If unsure, say N.
+
config HW_BREAKPOINT_KUNIT_TEST
bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS
depends on HAVE_HW_BREAKPOINT
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -398,6 +398,8 @@ obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fort
obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o
obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o
+obj-$(CONFIG_LONGEST_SYM_KUNIT_TEST) += longest_symbol_kunit.o
+CFLAGS_longest_symbol_kunit.o += $(call cc-disable-warning, missing-prototypes)
obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
--- /dev/null
+++ b/lib/longest_symbol_kunit.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test the longest symbol length. Execute with:
+ * ./tools/testing/kunit/kunit.py run longest-symbol
+ * --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y
+ * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n
+ * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <kunit/test.h>
+#include <linux/stringify.h>
+#include <linux/kprobes.h>
+#include <linux/kallsyms.h>
+
+#define DI(name) s##name##name
+#define DDI(name) DI(n##name##name)
+#define DDDI(name) DDI(n##name##name)
+#define DDDDI(name) DDDI(n##name##name)
+#define DDDDDI(name) DDDDI(n##name##name)
+
+/*Generate a symbol whose name length is 511 */
+#define LONGEST_SYM_NAME DDDDDI(g1h2i3j4k5l6m7n)
+
+#define RETURN_LONGEST_SYM 0xAAAAA
+
+noinline int LONGEST_SYM_NAME(void);
+noinline int LONGEST_SYM_NAME(void)
+{
+ return RETURN_LONGEST_SYM;
+}
+
+_Static_assert(sizeof(__stringify(LONGEST_SYM_NAME)) == KSYM_NAME_LEN,
+"Incorrect symbol length found. Expected KSYM_NAME_LEN: "
+__stringify(KSYM_NAME_LEN) ", but found: "
+__stringify(sizeof(LONGEST_SYM_NAME)));
+
+static void test_longest_symbol(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, LONGEST_SYM_NAME());
+};
+
+static void test_longest_symbol_kallsyms(struct kunit *test)
+{
+ unsigned long (*kallsyms_lookup_name)(const char *name);
+ static int (*longest_sym)(void);
+
+ struct kprobe kp = {
+ .symbol_name = "kallsyms_lookup_name",
+ };
+
+ if (register_kprobe(&kp) < 0) {
+ pr_info("%s: kprobe not registered", __func__);
+ KUNIT_FAIL(test, "test_longest_symbol kallsyms: kprobe not registered\n");
+ return;
+ }
+
+ kunit_warn(test, "test_longest_symbol kallsyms: kprobe registered\n");
+ kallsyms_lookup_name = (unsigned long (*)(const char *name))kp.addr;
+ unregister_kprobe(&kp);
+
+ longest_sym =
+ (void *) kallsyms_lookup_name(__stringify(LONGEST_SYM_NAME));
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, longest_sym());
+};
+
+static struct kunit_case longest_symbol_test_cases[] = {
+ KUNIT_CASE(test_longest_symbol),
+ KUNIT_CASE(test_longest_symbol_kallsyms),
+ {}
+};
+
+static struct kunit_suite longest_symbol_test_suite = {
+ .name = "longest-symbol",
+ .test_cases = longest_symbol_test_cases,
+};
+kunit_test_suite(longest_symbol_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test the longest symbol length");
+MODULE_AUTHOR("Sergio González Collado");


@@ -0,0 +1,70 @@
From cda8b1022f32bb7a917148f75f4641e7a5b3e729 Mon Sep 17 00:00:00 2001
From: Jinliang Zheng <alexjlzheng@tencent.com>
Date: Tue, 15 Apr 2025 17:02:32 +0800
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
calling writeback_set_ratelimit(), as global_dirty_limits() always
prioritizes the value of vm_dirty_bytes.
It's domain_dirty_limits() that's relevant here, not node_dirty_ok:
dirty_ratio_handler
  writeback_set_ratelimit
    global_dirty_limits(&dirty_thresh)  <- ratelimit_pages based on dirty_thresh
      domain_dirty_limits
        if (bytes)                      <- bytes = vm_dirty_bytes <-----------+
          thresh = f1(bytes)            <- prioritizes vm_dirty_bytes         |
        else                                                                  |
          thresh = f2(ratio)                                                  |
    ratelimit_pages = f3(dirty_thresh)                                        |
  vm_dirty_bytes = 0                    <- it's late! -------------------------+
This causes ratelimit_pages to still use the value calculated based on
vm_dirty_bytes, which is wrong now.
The impact visible to userspace is difficult to capture directly because
there is no procfs/sysfs interface exported to user space. However, it
will have a real impact on the balance of dirty pages.
For example:
1. By default, we have vm_dirty_ratio=40, vm_dirty_bytes=0.
2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192, vm_dirty_ratio=0,
   and ratelimit_pages is now calculated based on vm_dirty_bytes.
3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
   zero when writeback_set_ratelimit() -> global_dirty_limits() ->
   domain_dirty_limits() is called, ratelimit_pages is still calculated
   based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
   conform to the actual intent of the user.
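For illustration only, the small standalone program below (not kernel code;
the helper name and numbers are made up) mirrors the priority used by
domain_dirty_limits() - a non-zero bytes value always wins over the ratio -
and shows why the ordering matters: if vm_dirty_bytes is cleared only after
the recomputation, the threshold that ratelimit_pages is derived from
(f3(dirty_thresh) in the diagram above) still reflects the stale bytes value.
```
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Mirrors the priority logic of domain_dirty_limits(): bytes, if set, wins. */
static unsigned long dirty_thresh(unsigned long bytes, unsigned long ratio,
				  unsigned long avail_pages)
{
	if (bytes)
		return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
	return avail_pages * ratio / 100;
}

int main(void)
{
	unsigned long avail = 1000000;		/* available pages */
	unsigned long vm_dirty_bytes = 8192;	/* step 2: dirty_bytes set */
	unsigned long vm_dirty_ratio = 20;	/* step 3: dirty_ratio written */

	/* Buggy order: recompute while vm_dirty_bytes is still non-zero. */
	printf("buggy thresh: %lu pages\n",
	       dirty_thresh(vm_dirty_bytes, vm_dirty_ratio, avail));

	/* Fixed order: clear vm_dirty_bytes first, then recompute. */
	vm_dirty_bytes = 0;
	printf("fixed thresh: %lu pages\n",
	       dirty_thresh(vm_dirty_bytes, vm_dirty_ratio, avail));
	return 0;
}
```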
Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com>
Reviewed-by: MengEn Sun <mengensun@tencent.com>
Cc: Andrea Righi <andrea@betterlinux.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jinliang Zheng <alexjlzheng@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}


@@ -0,0 +1,179 @@
From 30a724581b5037176f6492359c189ebb180ccf1f Mon Sep 17 00:00:00 2001
From: GONG Ruiqi <gongruiqi1@huawei.com>
Date: Sun, 27 Apr 2025 10:53:03 +0800
Subject: vgacon: Add check for vc_origin address range in vgacon_scroll()
Our in-house Syzkaller reported the following BUG (twice), which we
believed was the same issue with [1]:
==================================================================
BUG: KASAN: slab-out-of-bounds in vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
Read of size 2 at addr ffff88800f5bef60 by task syz.7.2620/12393
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x72/0xa0 lib/dump_stack.c:106
print_address_description.constprop.0+0x6b/0x3d0 mm/kasan/report.c:364
print_report+0xba/0x280 mm/kasan/report.c:475
kasan_report+0xa9/0xe0 mm/kasan/report.c:588
vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
vcs_write_buf_noattr drivers/tty/vt/vc_screen.c:493 [inline]
vcs_write+0x586/0x840 drivers/tty/vt/vc_screen.c:690
vfs_write+0x219/0x960 fs/read_write.c:584
ksys_write+0x12e/0x260 fs/read_write.c:639
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
...
</TASK>
Allocated by task 5614:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
kasan_set_track+0x25/0x30 mm/kasan/common.c:52
____kasan_kmalloc mm/kasan/common.c:374 [inline]
__kasan_kmalloc+0x8f/0xa0 mm/kasan/common.c:383
kasan_kmalloc include/linux/kasan.h:201 [inline]
__do_kmalloc_node mm/slab_common.c:1007 [inline]
__kmalloc+0x62/0x140 mm/slab_common.c:1020
kmalloc include/linux/slab.h:604 [inline]
kzalloc include/linux/slab.h:721 [inline]
vc_do_resize+0x235/0xf40 drivers/tty/vt/vt.c:1193
vgacon_adjust_height+0x2d4/0x350 drivers/video/console/vgacon.c:1007
vgacon_font_set+0x1f7/0x240 drivers/video/console/vgacon.c:1031
con_font_set drivers/tty/vt/vt.c:4628 [inline]
con_font_op+0x4da/0xa20 drivers/tty/vt/vt.c:4675
vt_k_ioctl+0xa10/0xb30 drivers/tty/vt/vt_ioctl.c:474
vt_ioctl+0x14c/0x1870 drivers/tty/vt/vt_ioctl.c:752
tty_ioctl+0x655/0x1510 drivers/tty/tty_io.c:2779
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:871 [inline]
__se_sys_ioctl+0x12d/0x190 fs/ioctl.c:857
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
__fput_sync+0x4c/0x60 fs/file_table.c:465
__do_sys_close fs/open.c:1580 [inline]
__se_sys_close+0x68/0xd0 fs/open.c:1565
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Second to last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
task_work_run+0x154/0x240 kernel/task_work.c:239
exit_task_work include/linux/task_work.h:45 [inline]
do_exit+0x8e5/0x1320 kernel/exit.c:874
do_group_exit+0xcd/0x280 kernel/exit.c:1023
get_signal+0x1675/0x1850 kernel/signal.c:2905
arch_do_signal_or_restart+0x80/0x3b0 arch/x86/kernel/signal.c:310
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
syscall_exit_to_user_mode+0x1b3/0x1e0 kernel/entry/common.c:218
do_syscall_64+0x66/0x110 arch/x86/entry/common.c:87
entry_SYSCALL_64_after_hwframe+0x78/0xe2
The buggy address belongs to the object at ffff88800f5be000
which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 2656 bytes to the right of
allocated 1280-byte region [ffff88800f5be000, ffff88800f5be500)
...
Memory state around the buggy address:
ffff88800f5bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88800f5bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
^
ffff88800f5bef80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bf000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
By analyzing the vmcore, we found that vc->vc_origin was somehow placed
one line prior to vc->vc_screenbuf when vc was in KD_TEXT mode, and
further writes to /dev/vcs caused out-of-bounds reads (and writes
right after) in vcs_write_buf_noattr().

Our further experiments show that in most cases vc->vc_origin equals
vga_vram_base when the console is in KD_TEXT mode, and it's around
vc->vc_screenbuf for KD_GRAPHICS mode. But by triggering a
TIOCL_SETVESABLANK ioctl beforehand, we can make vc->vc_origin be around
vc->vc_screenbuf while the console is in KD_TEXT mode, and then, by
writing the special 'ESC M' control sequence to the tty a certain number
of times (depending on the value of `vc->state.y - vc->vc_top`), we can
eventually move vc->vc_origin prior to vc->vc_screenbuf. Here's the PoC,
tested on QEMU:
```
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vt.h>
#include <linux/tiocl.h>

int main() {
	const int RI_NUM = 10; // should be greater than `vc->state.y - vc->vc_top`
	int tty_fd, vcs_fd;
	const char *tty_path = "/dev/tty0";
	const char *vcs_path = "/dev/vcs";
	const char escape_seq[] = "\x1bM"; // ESC + M
	const char trigger_seq[] = "Let's trigger an OOB write.";
	struct vt_sizes vt_size = { 70, 2 };
	int blank = TIOCL_BLANKSCREEN;

	tty_fd = open(tty_path, O_RDWR);

	char vesa_mode[] = { TIOCL_SETVESABLANK, 1 };
	ioctl(tty_fd, TIOCLINUX, vesa_mode);

	ioctl(tty_fd, TIOCLINUX, &blank);
	ioctl(tty_fd, VT_RESIZE, &vt_size);

	for (int i = 0; i < RI_NUM; ++i)
		write(tty_fd, escape_seq, sizeof(escape_seq) - 1);

	vcs_fd = open(vcs_path, O_RDWR);
	write(vcs_fd, trigger_seq, sizeof(trigger_seq));

	close(vcs_fd);
	close(tty_fd);
	return 0;
}
```
To solve this problem, add an address range validation check in
vgacon_scroll(), ensuring vc->vc_origin never precedes vc_screenbuf.
Reported-by: syzbot+9c09fda97a1a65ea859b@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=9c09fda97a1a65ea859b [1]
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Co-developed-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/console/vgacon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1168,7 +1168,7 @@ static bool vgacon_scroll(struct vc_data
c->vc_screenbuf_size - delta);
c->vc_origin = vga_vram_end - c->vc_screenbuf_size;
vga_rolled_over = 0;
- } else
+ } else if (oldo - delta >= (unsigned long)c->vc_screenbuf)
c->vc_origin -= delta;
c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size;
scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char,


@@ -1,36 +0,0 @@
From b5a4b82efd19d0687a5582a58f6830bf714e34fc Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 18 Mar 2025 15:32:30 -0700
Subject: x86/tools: Drop duplicate unlikely() definition in
insn_decoder_test.c
After commit c104c16073b7 ("Kunit to check the longest symbol length"),
there is a warning when building with clang because there is now a
definition of unlikely from compiler.h in tools/include/linux, which
conflicts with the one in the instruction decoder selftest:
arch/x86/tools/insn_decoder_test.c:15:9: warning: 'unlikely' macro redefined [-Wmacro-redefined]
Remove the second unlikely() definition, as it is no longer necessary,
clearing up the warning.
Fixes: c104c16073b7 ("Kunit to check the longest symbol length")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20250318-x86-decoder-test-fix-unlikely-redef-v1-1-74c84a7bf05b@kernel.org
---
arch/x86/tools/insn_decoder_test.c | 2 --
1 file changed, 2 deletions(-)
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -12,8 +12,6 @@
#include <stdarg.h>
#include <linux/kallsyms.h>
-#define unlikely(cond) (cond)
-
#include <asm/insn.h>
#include <inat.c>
#include <insn.c>


@@ -0,0 +1,102 @@
From 5cf26cf9fd9c11cb1543aac026f8928829895663 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:06 +0300
Subject: fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in do_register_framebuffer() fails to allocate
memory for the fb_videomode, it will later lead to a null-ptr dereference
in fb_videomode_to_var(), as the fb_info is registered without having the
expected mode in its modelist, i.e. the one described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
Even though fbcon_init() checks beforehand whether fb_match_mode() in
var_to_display() fails, it cannot prevent the panic because fbcon_init()
does not return an error code. Considering this, and the comment in the
code about fb_match_mode() returning NULL - "This should not happen" - it
is better to prevent registering the fb_info if its mode was not set
successfully. Also move fb_add_videomode() closer to the beginning of
do_register_framebuffer() to avoid having to do the cleanup on failure.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -388,7 +388,7 @@ static int fb_check_foreignness(struct f
static int do_register_framebuffer(struct fb_info *fb_info)
{
- int i;
+ int i, err = 0;
struct fb_videomode mode;
if (fb_check_foreignness(fb_info))
@@ -397,10 +397,18 @@ static int do_register_framebuffer(struc
if (num_registered_fb == FB_MAX)
return -ENXIO;
- num_registered_fb++;
for (i = 0 ; i < FB_MAX; i++)
if (!registered_fb[i])
break;
+
+ if (!fb_info->modelist.prev || !fb_info->modelist.next)
+ INIT_LIST_HEAD(&fb_info->modelist);
+
+ fb_var_to_videomode(&mode, &fb_info->var);
+ err = fb_add_videomode(&mode, &fb_info->modelist);
+ if (err < 0)
+ return err;
+
fb_info->node = i;
refcount_set(&fb_info->count, 1);
mutex_init(&fb_info->lock);
@@ -426,16 +434,12 @@ static int do_register_framebuffer(struc
if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT))
bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT);
- if (!fb_info->modelist.prev || !fb_info->modelist.next)
- INIT_LIST_HEAD(&fb_info->modelist);
-
if (fb_info->skip_vt_switch)
pm_vt_switch_required(fb_info->device, false);
else
pm_vt_switch_required(fb_info->device, true);
- fb_var_to_videomode(&mode, &fb_info->var);
- fb_add_videomode(&mode, &fb_info->modelist);
+ num_registered_fb++;
registered_fb[i] = fb_info;
#ifdef CONFIG_GUMSTIX_AM200EPD


@@ -1,34 +0,0 @@
From e3d18eed972374cfbac1e58cf109209b07c1e27e Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Tue, 8 Apr 2025 12:02:36 +0200
Subject: ice: mark ice_write_prof_mask_reg() as noinline
The following happens during build:
```
drivers/net/ethernet/intel/ice/ice.o: error: objtool: ice_free_prof_mask.isra.0() falls through to next function ice_free_flow_profs.cold()
drivers/net/ethernet/intel/ice/ice.o: error: objtool: ice_free_prof_mask.isra.0.cold() is missing an ELF size annotation
```
Marking ice_write_prof_mask_reg() as noinline solves this, although I'm
not sure whether this is a proper solution. Apparently this happens only
with -O3: the `default` case is never reachable, but the optimiser
generates a branch to a random code location.
Link: https://lore.kernel.org/lkml/6nzfoyak4cewjpmdflg5yi7jh2mqqdsfqgljoolx5lvdo2p65p@rwjfl7cqkfoo/
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
drivers/net/ethernet/intel/ice/ice_flex_pipe.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
+++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
@@ -1404,7 +1404,7 @@ static int ice_prof_inc_ref(struct ice_h
* @idx: index of the FV which will use the mask
* @mask: the 16-bit mask
*/
-static void
+static noinline void
ice_write_prof_mask_reg(struct ice_hw *hw, enum ice_block blk, u16 mask_idx,
u16 idx, u16 mask)
{


@@ -0,0 +1,65 @@
From 54c7f478f1a9d58f5609a48d461c7d495bb8301a Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:07 +0300
Subject: fbdev: Fix fb_set_var to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in fb_set_var() fails to allocate memory for the
fb_videomode, it may later lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered without having the
expected mode in its modelist, i.e. the one described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
The reason is that fb_info->var is being modified in fb_set_var(), and
then fb_videomode_to_var() is called. If it fails to add the mode to
fb_info->modelist, fb_set_var() returns an error, but does not restore
the old value of fb_info->var. Restore fb_info->var on failure the same
way it is done earlier in the function.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -328,8 +328,10 @@ fb_set_var(struct fb_info *info, struct
!list_empty(&info->modelist))
ret = fb_add_videomode(&mode, &info->modelist);
- if (ret)
+ if (ret) {
+ info->var = old_var;
return ret;
+ }
event.info = info;
event.data = &mode;


@@ -1,40 +0,0 @@
From e56acee381a8e07edf1920fb58f3166f911b6e5c Mon Sep 17 00:00:00 2001
From: Lingbo Kong <quic_lingbok@quicinc.com>
Date: Wed, 26 Feb 2025 19:31:18 +0800
Subject: wifi: ath12k: Abort scan before removing link interface to prevent
duplicate deletion
Currently, when ath12k performs the remove link interface operation, if
there is an ongoing scan operation on the arvif, ath12k may execute the
remove link interface operation multiple times on the same arvif. This
occurs because, during the remove link operation, if a scan operation is
present on the arvif, ath12k may receive a WMI_SCAN_EVENT_COMPLETED event
from the firmware. Upon receiving this event, ath12k will continue to
execute the ath12k_scan_vdev_clean_work() function, performing the remove
link interface operation on the same arvif again.
To address this issue, before executing the remove link interface
operation, ath12k needs to check if there is an ongoing scan operation on
the current arvif. If such an operation exists, it should be aborted.
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
Signed-off-by: Lingbo Kong <quic_lingbok@quicinc.com>
---
drivers/net/wireless/ath/ath12k/mac.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -9395,6 +9395,11 @@ ath12k_mac_op_unassign_vif_chanctx(struc
ar->num_started_vdevs == 1 && ar->monitor_vdev_created)
ath12k_mac_monitor_stop(ar);
+ if (ar->scan.arvif == arvif && ar->scan.state == ATH12K_SCAN_RUNNING) {
+ ath12k_scan_abort(ar);
+ ar->scan.arvif = NULL;
+ }
+
ath12k_mac_remove_link_interface(hw, arvif);
ath12k_mac_unassign_link_vif(arvif);
}


@@ -1,49 +0,0 @@
From 8d0e02f81d08c7b1e082028af0f55a22e7e1dfb2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 15 Apr 2025 10:22:04 +0200
Subject: Kconfig: switch CONFIG_SYSFS_SYCALL default to n
This odd system call will be removed in the future. Let's decouple it
from CONFIG_EXPERT and switch the default to n as a first step.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
init/Kconfig | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1603,6 +1603,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
the unaligned access emulation.
see arch/parisc/kernel/unaligned.c for reference
+config SYSFS_SYSCALL
+ bool "Sysfs syscall support"
+ default n
+ help
+ sys_sysfs is an obsolete system call no longer supported in libc.
+ Note that disabling this option is more secure but might break
+ compatibility with some systems.
+
+ If unsure say N here.
+
config HAVE_PCSPKR_PLATFORM
bool
@@ -1647,16 +1657,6 @@ config SGETMASK_SYSCALL
If unsure, leave the default option here.
-config SYSFS_SYSCALL
- bool "Sysfs syscall support" if EXPERT
- default y
- help
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
config FHANDLE
bool "open by fhandle syscalls" if EXPERT
select EXPORTFS


@@ -0,0 +1,113 @@
From 9cb2f9d210f915aabe54c5061d84f3fbe93c71ea Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:15 +0200
Subject: anon_inode: use a proper mode internally
This allows the VFS to not trip over anonymous inodes and we can add
asserts based on the mode into the vfs. When we report it to userspace
we can simply hide the mode to avoid regressions. I've audited all
direct callers of alloc_anon_inode() and only secretmen overrides i_mode
and i_op inode operations but it already uses a regular file.
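From userspace, the intended effect can be checked with fstat() on any
anonymous inode fd; a minimal sketch (eventfd is used here as an arbitrary
example, it is not taken from the patch):
```
/* After this change, st_mode of an anonymous inode fd should still report
 * no file-type bits (S_IFMT is masked off in getattr), preserving lsof's
 * expectation, even though the inode is internally a regular file. */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/eventfd.h>

int main(void)
{
	struct stat st;
	int fd = eventfd(0, 0);

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	printf("file type bits: 0%o (0 expected)\n", st.st_mode & S_IFMT);
	return 0;
}
```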
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-1-53a44c20d44e@kernel.org
Fixes: af153bb63a336 ("vfs: catch invalid modes in may_open()")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Reported-by: syzbot+5d8e79d323a13aa0b248@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67ed3fb3.050a0220.14623d.0009.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 36 ++++++++++++++++++++++++++++++++++++
fs/internal.h | 3 +++
fs/libfs.c | 8 +++++++-
3 files changed, 46 insertions(+), 1 deletion(-)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,44 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -66,6 +100,7 @@ static struct inode *anon_inode_make_sec
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@@ -313,6 +348,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -343,3 +343,6 @@ static inline bool path_mounted(const st
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1647,7 +1647,13 @@ struct inode *alloc_anon_inode(struct su
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ /*
+ * Historically anonymous inodes didn't have a type at all and
+ * userspace has come to rely on this. Internally they're just
+ * regular files but S_IFREG is masked off when reporting
+ * information to userspace.
+ */
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;


@@ -0,0 +1,80 @@
From ea4199112ae6d8da866417f50e035be01488c502 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:17 +0200
Subject: anon_inode: explicitly block ->setattr()
It is currently possible to change the mode and owner of the single
anonymous inode in the kernel:
#include <signal.h>
#include <unistd.h>
#include <sys/signalfd.h>
#include <sys/stat.h>

int main(int argc, char *argv[])
{
	int ret, sfd;
	sigset_t mask;
	struct signalfd_siginfo fdsi;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	sigaddset(&mask, SIGQUIT);
	ret = sigprocmask(SIG_BLOCK, &mask, NULL);
	if (ret < 0)
		_exit(1);

	sfd = signalfd(-1, &mask, 0);
	if (sfd < 0)
		_exit(2);

	ret = fchown(sfd, 5555, 5555);
	if (ret < 0)
		_exit(3);

	ret = fchmod(sfd, 0777);
	if (ret < 0)
		_exit(3);

	_exit(4);
}
This is a bug. It's not really a meaningful one because anonymous inodes
don't really figure into path lookup and they cannot be reopened via
/proc/<pid>/fd/<nr> and can't be used for lookup itself. So they can
only ever serve as direct references.
But it is still completely bogus to allow the mode and ownership or any
of the properties of the anonymous inode to be changed. Block this!
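With this change applied, the fchown()/fchmod() calls above are expected to
fail with EOPNOTSUPP rather than succeed. A minimal hedged check (eventfd
used as a stand-in for any anonymous inode fd; not part of the patch):
```
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/eventfd.h>

int main(void)
{
	int fd = eventfd(0, 0);

	if (fd < 0)
		return 1;
	/* Both should now be rejected by anon_inode_setattr(). */
	if (fchmod(fd, 0777) < 0 && errno == EOPNOTSUPP &&
	    fchown(fd, 5555, 5555) < 0 && errno == EOPNOTSUPP)
		printf("setattr correctly blocked with EOPNOTSUPP\n");
	else
		printf("unexpected result, errno=%d\n", errno);
	return 0;
}
```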
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-3-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 7 +++++++
fs/internal.h | 2 ++
2 files changed, 9 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -57,8 +57,15 @@ int anon_inode_getattr(struct mnt_idmap
return 0;
}
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
static const struct inode_operations anon_inode_operations = {
.getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
};
/*
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -346,3 +346,5 @@ int statmount_mnt_idmap(struct mnt_idmap
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);


@@ -1,35 +0,0 @@
From f762c206076d274ecb0e2f3d9b6cbca361ebb246 Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Thu, 1 May 2025 20:22:53 +0200
Subject: wifi: mac80211: mark copy_mesh_setup() as noinline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
With -O3 and GCC v15.1, the following happens:
```
In function fortify_memcpy_chk,
inlined from copy_mesh_setup at net/mac80211/cfg.c:2541:2,
inlined from ieee80211_join_mesh at net/mac80211/cfg.c:2694:8:
./include/linux/fortify-string.h:571:25: warning: call to __write_overflow_field declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning]
```
Maybe, it's time to abandon -O3 altogether?
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
net/mac80211/cfg.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2501,7 +2501,7 @@ static inline bool _chg_mesh_attr(enum n
return (mask >> (parm-1)) & 0x1;
}
-static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
+static noinline int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
const struct mesh_setup *setup)
{
u8 *new_ie;


@@ -0,0 +1,39 @@
From 79f54c5bc7c6097a379c83e9ed56bee27cf1218a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:19 +0200
Subject: anon_inode: raise SB_I_NODEV and SB_I_NOEXEC
It isn't possible to execute anonymous inodes because they cannot be
opened in any way after they have been created. This includes execution:
execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH)
Anonymous inodes have inode->f_op set to no_open_fops, which sets
no_open(), which returns ENXIO. That means any call to do_dentry_open(),
which is the endpoint of do_open_execat(), will fail. There's no chance
to execute an anonymous inode, unless a given subsystem overrides it, of
course.
However, we should still harden this and raise SB_I_NODEV and
SB_I_NOEXEC on the superblock itself so that no one gets any creative
ideas.
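For illustration only (not from the patch), this is the kind of attempt being
ruled out. The raw syscall is used to avoid depending on a recent glibc
wrapper, and the exact errno may differ (ENXIO from no_open(), or an access
error once SB_I_NOEXEC applies):
```
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/eventfd.h>

int main(void)
{
	int fd = eventfd(0, 0);
	char *argv[] = { "anon", NULL };
	char *envp[] = { NULL };

	/* Try to execute an anonymous inode fd; this is expected to fail. */
	syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	printf("execveat on anon inode fd failed, errno=%d\n", errno);
	return 0;
}
```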
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-5-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 2 ++
1 file changed, 2 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -86,6 +86,8 @@ static int anon_inodefs_init_fs_context(
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}


@@ -0,0 +1,136 @@
From edaacbee0f33b7371ec460723d1042a6c5a4bb9d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 21 Apr 2025 10:27:40 +0200
Subject: fs: add S_ANON_INODE
This makes it easy to detect proper anonymous inodes and to ensure that
we can detect them in codepaths such as readahead().
Readahead on anonymous inodes didn't work because they didn't have a
proper mode. Now that they have we need to retain EINVAL being returned
otherwise LTP will fail.
We also need to ensure that ioctls aren't simply fired like they are for
regular files so things like inotify inodes continue to correctly call
their own ioctl handlers as in [1].
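A userspace way to check the retained behaviour (illustrative sketch only;
epoll is chosen arbitrarily as an anonymous inode fd):
```
/* readahead(2) on an anonymous inode fd is expected to keep failing with
 * EINVAL after this change, matching what LTP asserts. */
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/epoll.h>

int main(void)
{
	int fd = epoll_create1(0);
	int ret = readahead(fd, 0, 4096);

	printf("readahead: %d, errno=%d (EINVAL=%d expected)\n", ret, errno, EINVAL);
	return 0;
}
```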
Reported-by: Xilin Wu <sophon@radxa.com>
Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1]
Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/ioctl.c | 7 ++++---
fs/libfs.c | 2 +-
fs/pidfs.c | 2 +-
include/linux/fs.h | 2 ++
mm/readahead.c | 20 ++++++++++++++++----
5 files changed, 24 insertions(+), 9 deletions(-)
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_get_fs_sysfs_path(filp, argp);
default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1656,7 +1656,7 @@ struct inode *alloc_anon_inode(struct su
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -826,7 +826,7 @@ static int pidfs_init_inode(struct inode
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2344,6 +2344,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struc
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t of
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);


@@ -0,0 +1,35 @@
From ab287d709809b6dfe4d3c42016a543d976533d51 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Wed, 7 May 2025 19:50:26 +0800
Subject: configfs: Do not override creating attribute file failure in
populate_attrs()
populate_attrs() may override a failure to create attribute files with
the success of creating subsequent bin attribute files, and thus return
the wrong value.

Fix this by only creating bin attribute files after the attribute files
have been created successfully.
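A standalone illustration of the return-value pattern being fixed (not the
configfs code itself; the helper names below are made up): a later success
clobbers an earlier failure if the second loop is not gated on the first one
succeeding.
```
#include <stdio.h>

static int create_attrs(void)     { return -1; }  /* pretend this fails    */
static int create_bin_attrs(void) { return  0; }  /* and this succeeds     */

static int populate_buggy(void)
{
	int error = create_attrs();

	error = create_bin_attrs();         /* oops: overwrites the earlier error */
	return error;
}

static int populate_fixed(void)
{
	int error = create_attrs();

	if (!error)
		error = create_bin_attrs(); /* only run when nothing failed */
	return error;
}

int main(void)
{
	printf("buggy: %d, fixed: %d\n", populate_buggy(), populate_fixed());
	return 0;
}
```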
Fixes: 03607ace807b ("configfs: implement binary attributes")
Cc: stable@vger.kernel.org
Reviewed-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20250507-fix_configfs-v3-2-fe2d96de8dc4@quicinc.com
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
fs/configfs/dir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -619,7 +619,7 @@ static int populate_attrs(struct config_
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
continue;


@@ -0,0 +1,104 @@
From 896b7b0d6ed53a7fe159c4b76f25407c816aa619 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 23 May 2025 19:20:36 -0400
Subject: Don't propagate mounts into detached trees
All versions up to 6.14 did not propagate mount events into detached
trees. Shortly after 6.14, a merge of vfs-6.15-rc1.mount.namespace
(130e696aa68b) changed that.
Unfortunately, that has caused userland regressions (reported in
https://lore.kernel.org/all/CAOYeF9WQhFDe+BGW=Dp5fK8oRy5AgZ6zokVyTj1Wp4EUiYgt4w@mail.gmail.com/)
Straight revert wouldn't be an option - in particular, the variant in 6.14
had a bug that got fixed in d1ddc6f1d9f0 ("fix IS_MNT_PROPAGATING uses")
and we don't want to bring the bug back.
This is a modification of manual revert posted by Christian, with changes
needed to avoid reintroducing the breakage in scenario described in
d1ddc6f1d9f0.
Cc: stable@vger.kernel.org
Reported-by: Allison Karlitskaya <lis@redhat.com>
Tested-by: Allison Karlitskaya <lis@redhat.com>
Acked-by: Christian Brauner <brauner@kernel.org>
Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/mount.h | 5 -----
fs/namespace.c | 15 ++-------------
fs/pnode.c | 4 ++--
3 files changed, 4 insertions(+), 20 deletions(-)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -7,10 +7,6 @@
extern struct list_head notify_list;
-typedef __u32 __bitwise mntns_flags_t;
-
-#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
-
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
@@ -37,7 +33,6 @@ struct mnt_namespace {
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
- mntns_flags_t mntns_flags;
} __randomize_layout;
struct mnt_pcp {
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3648,7 +3648,7 @@ static int do_move_mount(struct path *ol
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
- if (is_anon_ns(ns)) {
+ if (is_anon_ns(ns) && ns == p->mnt_ns) {
/*
* Ending up with two files referring to the root of the
* same anonymous mount namespace would cause an error
@@ -3656,16 +3656,7 @@ static int do_move_mount(struct path *ol
* twice into the mount tree which would be rejected
* later. But be explicit about it right here.
*/
- if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
- goto out;
-
- /*
- * If this is an anonymous mount tree ensure that mount
- * propagation can detect mounts that were just
- * propagated to the target mount tree so we don't
- * propagate onto them.
- */
- ns->mntns_flags |= MNTNS_PROPAGATING;
+ goto out;
} else if (is_anon_ns(p->mnt_ns)) {
/*
* Don't allow moving an attached mount tree to an
@@ -3722,8 +3713,6 @@ static int do_move_mount(struct path *ol
if (attached)
put_mountpoint(old_mp);
out:
- if (is_anon_ns(ns))
- ns->mntns_flags &= ~MNTNS_PROPAGATING;
unlock_mount(mp);
if (!err) {
if (attached) {
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -231,8 +231,8 @@ static int propagate_one(struct mount *m
/* skip if mountpoint isn't visible in m */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
return 0;
- /* skip if m is in the anon_ns we are emptying */
- if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING)
+ /* skip if m is in the anon_ns */
+ if (is_anon_ns(m->mnt_ns))
return 0;
if (peers(m, last_dest)) {


@@ -0,0 +1,51 @@
From bc86aaf0e0256220ca787fdbb57a73429ade1129 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:52 -0600
Subject: mm/filemap: gate dropbehind invalidate on folio !dirty && !writeback
It's possible for the folio to either get marked for writeback or get
redirtied. Add a helper, filemap_end_dropbehind(), which guards the
folio_unmap_invalidate() call behind a check for the folio being both
non-dirty and not under writeback AFTER the folio lock has been
acquired. Use this helper in folio_end_dropbehind_write().
Cc: stable@vger.kernel.org
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Fixes: fb7d3bc41493 ("mm/filemap: drop streaming/uncached pages when writeback completes")
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-2-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1589,6 +1589,16 @@ int folio_wait_private_2_killable(struct
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
@@ -1604,8 +1614,7 @@ static void folio_end_dropbehind_write(s
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}


@@ -0,0 +1,51 @@
From fad76185ca91983990c660642151083eb05cbfc0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:53 -0600
Subject: mm/filemap: use filemap_end_dropbehind() for read invalidation
Use the filemap_end_dropbehind() helper rather than calling
folio_unmap_invalidate() directly, as we need to check if the folio has
been redirtied or marked for writeback once the folio lock has been
re-acquired.
Cc: stable@vger.kernel.org
Reported-by: Trond Myklebust <trondmy@hammerspace.com>
Fixes: 8026e49bff9b ("mm/filemap: add read support for RWF_DONTCACHE")
Link: https://lore.kernel.org/linux-fsdevel/ba8a9805331ce258a622feaca266b163db681a10.camel@hammerspace.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-3-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2644,8 +2644,7 @@ static inline bool pos_same_folio(loff_t
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
@@ -2653,7 +2652,7 @@ static void filemap_end_dropbehind_read(
return;
if (folio_trylock(folio)) {
if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2774,7 +2773,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);


@@ -0,0 +1,29 @@
From f0579d45f2e03fa3ba0d9466e79a31ea37acb487 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:54 -0600
Subject: Revert "Disable FOP_DONTCACHE for now due to bugs"
This reverts commit 478ad02d6844217cc7568619aeb0809d93ade43d.
Both the read and write side dirty && writeback races should be resolved
now; revert the commit that disabled FOP_DONTCACHE for filesystems.
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-4-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/fs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2186,7 +2186,7 @@ struct file_operations {
/* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
-#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */
+#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,


@@ -0,0 +1,36 @@
From 3b4614564770691cf3a6eb88127268ef6a84180c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:55 -0600
Subject: mm/filemap: unify read/write dropbehind naming
The read side is filemap_end_dropbehind_read(), while the write side
used folio_ as the prefix rather than filemap_. The read side makes more
sense; unify the naming such that the write side follows it.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-5-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1604,7 +1604,7 @@ static void filemap_end_dropbehind(struc
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
@@ -1659,7 +1659,7 @@ void folio_end_writeback(struct folio *f
acct_reclaim_writeback(folio);
if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);


@@ -0,0 +1,78 @@
From 6003153e1bc4ad4952773081d7b89aa1ab2274c3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:56 -0600
Subject: mm/filemap: unify dropbehind flag testing and clearing
The read and write sides do this a bit differently; unify it such that
the _{read,write} helpers check the bit before locking, and the generic
handler is in charge of clearing the bit and invalidating, once under
the folio lock.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-6-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1595,7 +1595,11 @@ static void filemap_end_dropbehind(struc
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
folio_unmap_invalidate(mapping, folio, 0);
}
@@ -1606,6 +1610,9 @@ static void filemap_end_dropbehind(struc
*/
static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1629,8 +1636,6 @@ static void filemap_end_dropbehind_write
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1652,14 +1657,11 @@ void folio_end_writeback(struct folio *f
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- filemap_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -2651,8 +2653,7 @@ static void filemap_end_dropbehind_read(
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- filemap_end_dropbehind(folio);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}


@@ -0,0 +1,98 @@
From 61c0b2450f2b85c5053fa4f71d9c619b34d3af6c Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Mon, 26 May 2025 18:28:18 +0000
Subject: mm/khugepaged: fix race with folio split/free using temporary
reference
hpage_collapse_scan_file() calls is_refcount_suitable(), which in turn
calls folio_mapcount(). folio_mapcount() checks folio_test_large() before
proceeding to folio_large_mapcount(), but there is a race window where the
folio may get split/freed between these checks, triggering:
VM_WARN_ON_FOLIO(!folio_test_large(folio), folio)
Take a temporary reference to the folio in hpage_collapse_scan_file().
This stabilizes the folio during the refcount check and prevents
incorrect large folio detection due to a concurrent split/free. Use the
helper folio_expected_ref_count() + 1 (the extra 1 accounts for the
temporary reference taken here) to compare with folio_ref_count()
instead of using is_refcount_suitable().
Link: https://lkml.kernel.org/r/20250526182818.37978-1-shivankg@amd.com
Fixes: 05c5323b2a34 ("mm: track mapcount of large folios in single value")
Signed-off-by: Shivank Garg <shivankg@amd.com>
Reported-by: syzbot+2b99589e33edbe9475ca@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6828470d.a70a0220.38f255.000c.GAE@google.com
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Fengwei Yin <fengwei.yin@intel.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/khugepaged.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2295,6 +2295,17 @@ static int hpage_collapse_scan_file(stru
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2316,27 @@ static int hpage_collapse_scan_file(stru
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2348,7 @@ static int hpage_collapse_scan_file(stru
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);

View File

@@ -0,0 +1,198 @@
From 214092002cbd9945b7cc6314e76ec42b3f588c01 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Wed, 30 Apr 2025 10:01:51 +0000
Subject: mm: add folio_expected_ref_count() for reference count calculation
Patch series " JFS: Implement migrate_folio for jfs_metapage_aops" v5.
This patchset addresses a warning that occurs during memory compaction due
to JFS's missing migrate_folio operation. The warning was introduced by
commit 7ee3647243e5 ("migrate: Remove call to ->writepage") which added
explicit warnings when filesystems don't implement migrate_folio.
Syzbot reported the following [1]:
jfs_metapage_aops does not implement migrate_folio
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 fallback_migrate_folio mm/migrate.c:953 [inline]
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
Modules linked in:
CPU: 1 UID: 0 PID: 5861 Comm: syz-executor280 Not tainted 6.15.0-rc1-next-20250411-syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025
RIP: 0010:fallback_migrate_folio mm/migrate.c:953 [inline]
RIP: 0010:move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
To fix this issue, this series implements metapage_migrate_folio() for JFS,
which handles both single and multiple metapages per page configurations.
While most filesystems leverage existing migration implementations like
filemap_migrate_folio(), buffer_migrate_folio_norefs() or
buffer_migrate_folio() (which internally used folio_expected_refs()),
JFS's metapage architecture requires special handling of its private data
during migration. To support this, this series introduces the
folio_expected_ref_count(), which calculates external references to a
folio from page/swap cache, private data, and page table mappings.
This standardized implementation replaces the previous ad-hoc
folio_expected_refs() function and enables JFS to accurately determine
whether a folio has unexpected references before attempting migration.
Implement folio_expected_ref_count() to calculate expected folio reference
counts from:
- Page/swap cache (1 per page)
- Private data (1)
- Page table mappings (1 per map)
While originally needed for page migration operations, this improved
implementation standardizes reference counting by consolidating all
refcount contributors into a single, reusable function that can benefit
any subsystem needing to detect unexpected references to folios.
The folio_expected_ref_count() returns the sum of these external
references without including any reference the caller itself might hold.
Callers comparing against the actual folio_ref_count() must account for
their own references separately.
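For example, a caller that itself holds one reference (such as the temporary reference taken by folio_try_get() above) would check for unexpected references roughly like this (a usage sketch, not a quote from the patch):

        /* +1 accounts for the reference the caller itself is holding. */
        int expected = folio_expected_ref_count(folio) + 1;

        if (folio_ref_count(folio) != expected)
                return -EAGAIN; /* someone else holds a temporary reference */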
Link: https://syzkaller.appspot.com/bug?extid=8bb6fd945af4e0ad9299 [1]
Link: https://lkml.kernel.org/r/20250430100150.279751-1-shivankg@amd.com
Link: https://lkml.kernel.org/r/20250430100150.279751-2-shivankg@amd.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Co-developed-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Dave Kleikamp <shaggy@kernel.org>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
mm/migrate.c | 22 ++++---------------
2 files changed, 59 insertions(+), 18 deletions(-)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2307,6 +2307,61 @@ static inline bool folio_maybe_mapped_sh
return folio_test_large_maybe_mapped_shared(folio);
}
+/**
+ * folio_expected_ref_count - calculate the expected folio refcount
+ * @folio: the folio
+ *
+ * Calculate the expected folio refcount, taking references from the pagecache,
+ * swapcache, PG_private and page table mappings into account. Useful in
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
+ * GUP or other temporary references).
+ *
+ * Does currently not consider references from the LRU cache. If the folio
+ * was isolated from the LRU (which is the case during migration or split),
+ * the LRU cache does not apply.
+ *
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
+ * locked will return a stable result.
+ *
+ * Calling this function on a mapped folio will not result in a stable result,
+ * because nothing stops additional page table mappings from coming (e.g.,
+ * fork()) or going (e.g., munmap()).
+ *
+ * Calling this function without the folio lock will also not result in a
+ * stable result: for example, the folio might get dropped from the swapcache
+ * concurrently.
+ *
+ * However, even when called without the folio lock or on a mapped folio,
+ * this function can be used to detect unexpected references early (for example,
+ * if it makes sense to even lock the folio and unmap it).
+ *
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
+ * holding itself to the result.
+ *
+ * Returns the expected folio refcount.
+ */
+static inline int folio_expected_ref_count(const struct folio *folio)
+{
+ const int order = folio_order(folio);
+ int ref_count = 0;
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return 0;
+
+ if (folio_test_anon(folio)) {
+ /* One reference per page from the swapcache. */
+ ref_count += folio_test_swapcache(folio) << order;
+ } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
+ /* One reference per page from the pagecache. */
+ ref_count += !!folio->mapping << order;
+ /* One reference from PG_private. */
+ ref_count += folio_test_private(folio);
+ }
+
+ /* One reference per page table mapping. */
+ return ref_count + folio_mapcount(folio);
+}
+
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,20 +445,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -601,7 +587,7 @@ static int __folio_migrate_mapping(struc
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -618,7 +604,7 @@ int migrate_huge_page_move_mapping(struc
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -749,7 +735,7 @@ static int __migrate_folio(struct addres
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -837,7 +823,7 @@ static int __buffer_migrate_folio(struct
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;

View File

@@ -0,0 +1,129 @@
From 0f52f05148589fe4115322a9cc8ffab760091a0a Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Thu, 29 May 2025 15:56:47 +0000
Subject: mm: fix uprobe pte be overwritten when expanding vma
Patch series "Fix uprobe pte be overwritten when expanding vma".
This patch (of 4):
We encountered a BUG alert triggered by Syzkaller as follows:
BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1
And we can reproduce it with the following steps:
1. register uprobe on file at zero offset
2. mmap the file at zero offset:
addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
3. mremap part of vma1 to new vma2:
addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
4. mremap back to orig addr1:
mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
In step 3, the vma1 range [addr1, addr1 + 4096] will be remapped to the new vma2
with range [addr2, addr2 + 8192], and remap uprobe anon page from the vma1
to vma2, then unmap the vma1 range [addr1, addr1 + 4096].
In step 4, the vma2 range [addr2, addr2 + 4096] will be remapped back to the
addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096,
addr1 + 8192] still maps the file, it will take vma_merge_new_range to
expand the range, and then do uprobe_mmap in vma_complete. Since the
merged vma pgoff is also zero offset, it will install the uprobe anon page
into the merged vma. However, the upcoming move_page_tables step, which uses
set_pte_at to remap the vma2 uprobe pte to the merged vma, will overwrite
the newly installed uprobe pte in the merged vma and leave that pte orphaned.
Since the uprobe pte will be remapped to the merged vma, we can remove the
unnecessary uprobe_mmap upon merged vma.
This problem was first found in linux-6.6.y and also exists in the
community syzkaller:
https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/
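A minimal userspace sketch of steps 2-4 above (assuming a uprobe has already been registered at file offset 0 by other means, e.g. via tracefs; the file path and error handling are placeholders):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        if (argc < 2)
                return 1;

        int fd = open(argv[1], O_RDONLY);       /* file with a uprobe at offset 0 */
        if (fd < 0)
                return 1;

        /* Step 2: map two pages of the file at offset 0. */
        void *addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
        if (addr1 == MAP_FAILED)
                return 1;

        /* Step 3: move the first page away, growing the mapping to two pages. */
        void *addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
        if (addr2 == MAP_FAILED)
                return 1;

        /* Step 4: move it back to the original address, triggering the merge
         * with the still-mapped tail and the uprobe_mmap() described above. */
        if (mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1) == MAP_FAILED)
                return 1;

        close(fd);
        return 0;
}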
Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com
Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 20 +++++++++++++++++---
mm/vma.h | 7 +++++++
2 files changed, 24 insertions(+), 3 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -144,6 +144,9 @@ static void init_multi_vma_prep(struct v
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -333,10 +336,13 @@ static void vma_complete(struct vma_prep
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -1783,6 +1789,14 @@ struct vm_area_struct *copy_vma(struct v
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -120,6 +122,11 @@ struct vma_merge_struct {
*/
bool give_up_on_oom :1;
+ /*
+ * If set, skip uprobe_mmap upon merged vma.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*

View File

@@ -0,0 +1,217 @@
From 6f1e03b94f7777323aaefd9286d992a1cbd0adf7 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: mm/hugetlb: unshare page tables during VMA split, not before
Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early, it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.
An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:
1. from hugetlb_split(), holding:
- mmap lock (exclusively)
- VMA lock
- file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
call us with only the mmap lock held (in shared mode), but currently
only runs while holding mmap lock (exclusively) and VMA lock
Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.
[jannh@google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 60 +++++++++++++++++++++++---------
mm/vma.c | 7 ++++
tools/testing/vma/vma_internal.h | 2 ++
4 files changed, 56 insertions(+), 16 deletions(-)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -276,6 +276,7 @@ bool is_hugetlb_entry_migration(pte_t pt
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -473,6 +474,8 @@ static inline void fixup_hugetlb_reserva
{
}
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -120,7 +120,7 @@ static void hugetlb_vma_lock_free(struct
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7884,9 +7898,16 @@ void move_hugetlb_state(struct folio *ol
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7910,8 +7931,12 @@ static void hugetlb_unshare_pmds(struct
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7921,8 +7946,10 @@ static void hugetlb_unshare_pmds(struct
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7937,7 +7964,8 @@ static void hugetlb_unshare_pmds(struct
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
/*
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -516,7 +516,14 @@ __split_vma(struct vma_iterator *vmi, st
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -793,6 +793,8 @@ static inline void vma_adjust_trans_huge
(void)next;
}
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);

View File

@@ -0,0 +1,50 @@
From cbd0e47470ea4db11acf3612edf91b5047a90d24 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:54 +0200
Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
huge_pmd_unshare() drops a reference on a page table that may have
previously been shared across processes, potentially turning it into a
normal page table used in another process in which unrelated VMAs can
afterwards be installed.
If this happens in the middle of a concurrent gup_fast(), gup_fast() could
end up walking the page tables of another process. While I don't see any
way in which that immediately leads to kernel memory corruption, it is
really weird and unexpected.
Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
just like we do in khugepaged when removing page tables for a THP
collapse.
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/hugetlb.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7628,6 +7628,13 @@ int huge_pmd_unshare(struct mm_struct *m
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;

View File

@@ -0,0 +1,48 @@
From cb42e10062f07934d60ce2a9bc154ea7ac0bab5a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 2 Jun 2025 10:49:26 -0700
Subject: mm/madvise: handle madvise_lock() failure during race unwinding
When unwinding the race on -ERESTARTNOINTR handling of process_madvise(),
a madvise_lock() failure is ignored. Check the failure and abort the
remaining work in that case.
Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1830,7 +1830,9 @@ static ssize_t vector_madvise(struct mm_
/* Drop and reacquire lock to unwind race. */
madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
continue;
}
if (ret < 0)
@@ -1839,6 +1841,7 @@ static ssize_t vector_madvise(struct mm_
}
madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;

View File

@@ -0,0 +1,164 @@
From 0aeb6f83ff11709bb4b6fc9afa2f742681ca36e1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 28 May 2025 10:02:08 +0200
Subject: video: screen_info: Relocate framebuffers behind PCI bridges
Apply PCI host-bridge window offsets to screen_info framebuffers. Fixes
invalid access to I/O memory.
Resources behind a PCI host bridge can be relocated by a certain offset
in the kernel's CPU address range used for I/O. The framebuffer memory
range stored in screen_info refers to the CPU addresses as seen during
boot (where the offset is 0). During boot up, firmware may assign a
different memory offset to the PCI host bridge and thereby relocating
the framebuffer address of the PCI graphics device as seen by the kernel.
The information in screen_info must be updated as well.
The helper pcibios_bus_to_resource() performs the relocation of the
screen_info's framebuffer resource (given in PCI bus addresses). The
result matches the I/O-memory resource of the PCI graphics device (given
in CPU addresses). As before, we store away the information necessary to
later update the information in screen_info itself.
Commit 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated
EFI framebuffers") added the code for updating screen_info. It is based
on similar functionality that pre-existed in efifb. Efifb uses a pointer
to the PCI resource, while the newer code does a memcpy of the region.
Hence efifb sees any updates to the PCI resource and avoids the issue.
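The core of the fix amounts to translating the firmware-provided PCI bus address range into a CPU resource before matching it against the device's BARs. A sketch of that translation (lfb_base and lfb_size stand in for the values read from screen_info; error handling omitted):

        struct pci_bus_region bus_region = {
                .start = lfb_base,              /* PCI bus address from screen_info */
                .end   = lfb_base + lfb_size - 1,
        };
        struct resource r = { .flags = IORESOURCE_MEM };
        const struct resource *pr;

        /* Apply the host-bridge window offset: bus address -> CPU address. */
        pcibios_bus_to_resource(pdev->bus, &r, &bus_region);

        /* r is now in CPU addresses and can be matched against the BARs. */
        pr = pci_find_resource(pdev, &r);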
v3:
- Only use struct pci_bus_region for PCI bus addresses (Bjorn)
- Clarify address semantics in commit messages and comments (Bjorn)
v2:
- Fixed tags (Takashi, Ivan)
- Updated information on efifb
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reported-by: "Ivan T. Ivanov" <iivanov@suse.de>
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1240696
Tested-by: "Ivan T. Ivanov" <iivanov@suse.de>
Fixes: 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated EFI framebuffers")
Cc: dri-devel@lists.freedesktop.org
Cc: <stable@vger.kernel.org> # v6.9+
Link: https://lore.kernel.org/r/20250528080234.7380-1-tzimmermann@suse.de
---
drivers/video/screen_info_pci.c | 79 +++++++++++++++++++++------------
1 file changed, 50 insertions(+), 29 deletions(-)
--- a/drivers/video/screen_info_pci.c
+++ b/drivers/video/screen_info_pci.c
@@ -7,8 +7,8 @@
static struct pci_dev *screen_info_lfb_pdev;
static size_t screen_info_lfb_bar;
-static resource_size_t screen_info_lfb_offset;
-static struct resource screen_info_lfb_res = DEFINE_RES_MEM(0, 0);
+static resource_size_t screen_info_lfb_res_start; // original start of resource
+static resource_size_t screen_info_lfb_offset; // framebuffer offset within resource
static bool __screen_info_relocation_is_valid(const struct screen_info *si, struct resource *pr)
{
@@ -31,7 +31,7 @@ void screen_info_apply_fixups(void)
if (screen_info_lfb_pdev) {
struct resource *pr = &screen_info_lfb_pdev->resource[screen_info_lfb_bar];
- if (pr->start != screen_info_lfb_res.start) {
+ if (pr->start != screen_info_lfb_res_start) {
if (__screen_info_relocation_is_valid(si, pr)) {
/*
* Only update base if we have an actual
@@ -47,46 +47,67 @@ void screen_info_apply_fixups(void)
}
}
+static int __screen_info_lfb_pci_bus_region(const struct screen_info *si, unsigned int type,
+ struct pci_bus_region *r)
+{
+ u64 base, size;
+
+ base = __screen_info_lfb_base(si);
+ if (!base)
+ return -EINVAL;
+
+ size = __screen_info_lfb_size(si, type);
+ if (!size)
+ return -EINVAL;
+
+ r->start = base;
+ r->end = base + size - 1;
+
+ return 0;
+}
+
static void screen_info_fixup_lfb(struct pci_dev *pdev)
{
unsigned int type;
- struct resource res[SCREEN_INFO_MAX_RESOURCES];
- size_t i, numres;
+ struct pci_bus_region bus_region;
int ret;
+ struct resource r = {
+ .flags = IORESOURCE_MEM,
+ };
+ const struct resource *pr;
const struct screen_info *si = &screen_info;
if (screen_info_lfb_pdev)
return; // already found
type = screen_info_video_type(si);
- if (type != VIDEO_TYPE_EFI)
- return; // only applies to EFI
+ if (!__screen_info_has_lfb(type))
+ return; // only applies to EFI; maybe VESA
- ret = screen_info_resources(si, res, ARRAY_SIZE(res));
+ ret = __screen_info_lfb_pci_bus_region(si, type, &bus_region);
if (ret < 0)
return;
- numres = ret;
- for (i = 0; i < numres; ++i) {
- struct resource *r = &res[i];
- const struct resource *pr;
-
- if (!(r->flags & IORESOURCE_MEM))
- continue;
- pr = pci_find_resource(pdev, r);
- if (!pr)
- continue;
-
- /*
- * We've found a PCI device with the framebuffer
- * resource. Store away the parameters to track
- * relocation of the framebuffer aperture.
- */
- screen_info_lfb_pdev = pdev;
- screen_info_lfb_bar = pr - pdev->resource;
- screen_info_lfb_offset = r->start - pr->start;
- memcpy(&screen_info_lfb_res, r, sizeof(screen_info_lfb_res));
- }
+ /*
+ * Translate the PCI bus address to resource. Account
+ * for an offset if the framebuffer is behind a PCI host
+ * bridge.
+ */
+ pcibios_bus_to_resource(pdev->bus, &r, &bus_region);
+
+ pr = pci_find_resource(pdev, &r);
+ if (!pr)
+ return;
+
+ /*
+ * We've found a PCI device with the framebuffer
+ * resource. Store away the parameters to track
+ * relocation of the framebuffer aperture.
+ */
+ screen_info_lfb_pdev = pdev;
+ screen_info_lfb_bar = pr - pdev->resource;
+ screen_info_lfb_offset = r.start - pr->start;
+ screen_info_lfb_res_start = bus_region.start;
}
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY, 16,
screen_info_fixup_lfb);

View File

@@ -0,0 +1,86 @@
From 06ff725d11ea8713876187973c834fb595cb26f1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 3 Jun 2025 17:48:20 +0200
Subject: sysfb: Fix screen_info type check for VGA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Use the helper screen_info_video_type() to get the framebuffer
type from struct screen_info. Handle supported values in a sorted
switch statement.
Reading orig_video_isVGA is unreliable. On most systems it is a
VIDEO_TYPE_ constant. On some systems with VGA it is simply set
to 1 to signal the presence of a VGA output. See vga_probe() for
an example. Retrieving the screen_info type with the helper
screen_info_video_type() detects these cases and returns the
appropriate VIDEO_TYPE_ constant. For VGA, sysfb creates a device
named "vga-framebuffer".
The sysfb code has been taken from vga16fb, where it likely didn't
work correctly either. With this bugfix applied, vga16fb loads for
compatible vga-framebuffer devices.
Fixes: 0db5b61e0dc0 ("fbdev/vga16fb: Create EGA/VGA devices in sysfb code")
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Javier Martinez Canillas <javierm@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Tzung-Bi Shih <tzungbi@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: "Uwe Kleine-König" <u.kleine-koenig@baylibre.com>
Cc: Zsolt Kajtar <soci@c64.rulez.org>
Cc: <stable@vger.kernel.org> # v6.1+
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20250603154838.401882-1-tzimmermann@suse.de
---
drivers/firmware/sysfb.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -143,6 +143,7 @@ static __init int sysfb_init(void)
{
struct screen_info *si = &screen_info;
struct device *parent;
+ unsigned int type;
struct simplefb_platform_data mode;
const char *name;
bool compatible;
@@ -170,17 +171,26 @@ static __init int sysfb_init(void)
goto put_device;
}
+ type = screen_info_video_type(si);
+
/* if the FB is incompatible, create a legacy framebuffer device */
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
- name = "efi-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- name = "vesa-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VGAC)
- name = "vga-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_EGAC)
+ switch (type) {
+ case VIDEO_TYPE_EGAC:
name = "ega-framebuffer";
- else
+ break;
+ case VIDEO_TYPE_VGAC:
+ name = "vga-framebuffer";
+ break;
+ case VIDEO_TYPE_VLFB:
+ name = "vesa-framebuffer";
+ break;
+ case VIDEO_TYPE_EFI:
+ name = "efi-framebuffer";
+ break;
+ default:
name = "platform-framebuffer";
+ break;
+ }
pd = platform_device_alloc(name, 0);
if (!pd) {

View File

@@ -0,0 +1,113 @@
From ba4c83076943b477c90015581cc88e262a7d772f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Feb 2025 16:01:57 +0100
Subject: x86/iopl: Cure TIF_IO_BITMAP inconsistencies
io_bitmap_exit() is invoked from exit_thread() when a task exits or
when a fork fails. In the latter case exit_thread() cleans up
resources which were allocated during fork().
io_bitmap_exit() invokes task_update_io_bitmap(), which in turn ends up
in tss_update_io_bitmap(). tss_update_io_bitmap() operates on the
current task. If current has TIF_IO_BITMAP set, but no bitmap installed,
tss_update_io_bitmap() crashes with a NULL pointer dereference.
There are two issues, which lead to that problem:
1) io_bitmap_exit() should not invoke task_update_io_bitmap() when
the task being cleaned up is not the current task. That's a
clear indicator of a cleanup after a failed fork().
2) A task should not have TIF_IO_BITMAP set and neither a bitmap
installed nor IOPL emulation level 3 activated.
This happens when a kernel thread is created in the context of
a user space thread, which has TIF_IO_BITMAP set as the thread
flags are copied and the IO bitmap pointer is cleared.
Other than in the failed fork() case this has no impact because
kernel threads including IO workers never return to user space and
therefore never invoke tss_update_io_bitmap().
Cure this by adding the missing cleanups and checks:
1) Prevent io_bitmap_exit() from invoking task_update_io_bitmap() if
the task being cleaned up is not the current task.
2) Clear TIF_IO_BITMAP in copy_thread() unconditionally. For user
space forks it is set later, when the IO bitmap is inherited in
io_bitmap_share().
For paranoia's sake, add a warning to tss_update_io_bitmap() to catch
the case, when that code is invoked with inconsistent state.
Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped")
Reported-by: syzbot+e2b1803445d236442e54@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/87wmdceom2.ffs@tglx
---
arch/x86/kernel/ioport.c | 13 +++++++++----
arch/x86/kernel/process.c | 6 ++++++
2 files changed, 15 insertions(+), 4 deletions(-)
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
{
+ struct task_struct *tsk = current;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *
struct io_bitmap *iobm = tsk->thread.io_bitmap;
tsk->thread.io_bitmap = NULL;
- task_update_io_bitmap(tsk);
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, leve
}
t->iopl_emul = level;
- task_update_io_bitmap(current);
-
+ task_update_io_bitmap();
return 0;
}
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -181,6 +181,7 @@ int copy_thread(struct task_struct *p, c
frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -469,6 +470,11 @@ void native_tss_update_io_bitmap(void)
} else {
struct io_bitmap *iobm = t->io_bitmap;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
/*
* Only copy bitmap data when the sequence number differs. The
* update time is accounted to the incoming task.

View File

@@ -0,0 +1,200 @@
From 7856e6900a09ed537366a5e0c774be8926ee022e Mon Sep 17 00:00:00 2001
From: Luo Gengkun <luogengkun@huaweicloud.com>
Date: Mon, 21 Apr 2025 03:50:21 +0000
Subject: watchdog: fix watchdog may detect false positive of softlockup
When updating `watchdog_thresh`, there is a race condition between writing
the new `watchdog_thresh` value and stopping the old watchdog timer. If
the old timer triggers during this window, it may falsely detect a
softlockup due to the old interval and the new `watchdog_thresh` value
being used. The problem can be described as follows:
# We assume the previous watchdog_thresh is 60, so the watchdog timer is
# coming every 24s.
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
|
+------>+ update watchdog_thresh (We are in kernel now)
        |
        |       # using old interval and new `watchdog_thresh`
        +------>+ watchdog hrtimer (irq context: detect softlockup)
                |
                |
                +-------+
                |
                |
                + softlockup_stop_all
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
The update to the actual `watchdog_thresh` is delayed until after the old
timer is stopped, preventing false positives.
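In pattern form, the sysctl handler only ever writes the shadow value, and the live watchdog_thresh is updated after the old timers have been stopped (a condensed sketch of the change below):

        static int __read_mostly watchdog_thresh_next; /* written by the sysctl handler */

        static void __lockup_detector_reconfigure(bool thresh_changed)
        {
                softlockup_stop_all();          /* the old timers are gone ... */
                if (thresh_changed)
                        watchdog_thresh = READ_ONCE(watchdog_thresh_next);
                set_sample_period();            /* ... before the new period is computed */
                /* ... re-enable and restart the watchdogs ... */
        }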
The following testcase may help to understand this problem.
---------------------------------------------
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
echo 60 > /proc/sys/kernel/watchdog_thresh
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
echo 10 > /proc/sys/kernel/watchdog_thresh &
---------------------------------------------
The test case above first removes the throttling restrictions for
real-time tasks. It then sets watchdog_thresh to 60 and executes a
real-time task, a simple while(1) loop, on cpu3. Consequently, the final
command gets blocked because the presence of this real-time thread
prevents kworker:3 from being selected by the scheduler. This eventually
triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating
with inconsistent variables - using both the old interval and the updated
watchdog_thresh simultaneously.
[nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com
Signed-off-by: Luo Gengkun <luogengkun@huaweicloud.com>
Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: "Nysal Jan K.A." <nysal@linux.ibm.com>
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
1 file changed, 27 insertions(+), 14 deletions(-)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,

View File

@@ -0,0 +1,288 @@
From 45c6602b7fa2a9dfd05a1f9289504c2437205ce4 Mon Sep 17 00:00:00 2001
From: Harshit Agarwal <harshit@nutanix.com>
Date: Tue, 25 Feb 2025 18:05:53 +0000
Subject: sched/rt: Fix race in push_rt_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqueue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Zs rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Zs rq, CPU Xs rq).
e) Since X is lower than Z, CPU A unlocks CPU Zs rq. Someone else has
locked CPU Xs rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
B.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU Xs lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
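Condensed, the re-validation after re-acquiring the locks becomes (a sketch of the hunk below):

        /*
         * double_lock_balance() may have dropped and re-taken rq->lock.
         * The task is only still a valid push candidate if it is still the
         * first entry on this runqueue's pushable_tasks list.
         */
        if (unlikely(is_migration_disabled(task) ||
                     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
                     task != pick_next_pushable_task(rq))) {
                double_unlock_balance(rq, lowest_rq);
                lowest_rq = NULL;
        }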
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such a
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Harshit Agarwal <harshit@nutanix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Tested-by: Will Ton <william.ton@nutanix.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
---
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 28 deletions(-)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1883,6 +1883,27 @@ static int find_lowest_rq(struct task_st
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1913,18 +1934,16 @@ static struct rq *find_lock_lowest_rq(st
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1944,27 +1963,6 @@ static struct rq *find_lock_lowest_rq(st
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task

View File

@@ -0,0 +1,62 @@
From 14b4658d3fa78b169f36e62e722a076a7c50afd8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 Jan 2025 15:39:49 +0100
Subject: sched/fair: Adhere to place_entity() constraints
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") relies on commit 4423af84b297
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
trip a WARN in place_entity().
What happens is that the lag of the very last entity is 0 per
definition -- the average of one element matches the value of that
element. Therefore place_entity() will match the condition skipping
the lag adjustment:
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
Without the 'se->vlag' condition -- it will attempt to adjust the zero
lag even though we're inserting into an empty tree.
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
don't because they didn't get updated.
Additionally, move update_load_add() after placement() as is
consistent with other place_entity() users -- this change is
non-functional; place_entity() does not use cfs_rq->load.
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de
---
kernel/sched/fair.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3808,6 +3808,7 @@ static void reweight_entity(struct cfs_r
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3834,10 +3835,11 @@ static void reweight_entity(struct cfs_r
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check

View File

@@ -0,0 +1,184 @@
From 65419a1e04de111460c4f38c47f1db39e71c3357 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 21 May 2025 09:06:02 -0700
Subject: alloc_tag: handle module codetag load errors as module load failures
Failures inside codetag_load_module() are currently ignored. As a result
an error there would not cause a module load failure and freeing of the
associated resources. Correct this behavior by propagating the error code
to the caller and handling possible errors. With this change, a failure to
allocate percpu counters, which happens at this stage, will not be ignored
and will cause a module load failure and freeing of resources. With this
change we also no longer need to disable memory allocation profiling when
this error happens; instead we fail to load the module.
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Casey Chen <cachen@purestorage.com>
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Wang <00107082@163.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/codetag.h | 8 ++++----
kernel/module/main.c | 5 +++--
lib/alloc_tag.c | 12 +++++++-----
lib/codetag.c | 34 +++++++++++++++++++++++++---------
4 files changed, 39 insertions(+), 20 deletions(-)
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,8 +36,8 @@ union codetag_ref {
struct codetag_type_desc {
const char *section;
size_t tag_size;
- void (*module_load)(struct module *mod,
- struct codetag *start, struct codetag *end);
+ int (*module_load)(struct module *mod,
+ struct codetag *start, struct codetag *end);
void (*module_unload)(struct module *mod,
struct codetag *start, struct codetag *end);
#ifdef CONFIG_MODULES
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struc
unsigned long align);
void codetag_free_module_sections(struct module *mod);
void codetag_module_replaced(struct module *mod, struct module *new_mod);
-void codetag_load_module(struct module *mod);
+int codetag_load_module(struct module *mod);
void codetag_unload_module(struct module *mod);
#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct modu
unsigned long align) { return NULL; }
static inline void codetag_free_module_sections(struct module *mod) {}
static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
-static inline void codetag_load_module(struct module *mod) {}
+static inline int codetag_load_module(struct module *mod) { return 0; }
static inline void codetag_unload_module(struct module *mod) {}
#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3399,11 +3399,12 @@ static int load_module(struct load_info
goto sysfs_cleanup;
}
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
/* Get rid of temporary copy. */
free_copy(info, flags);
- codetag_load_module(mod);
-
/* Done! */
trace_module_load(mod);
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -618,15 +618,16 @@ out:
mas_unlock(&mas);
}
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
{
/* Allocate module alloc_tag percpu counters */
struct alloc_tag *start_tag;
struct alloc_tag *stop_tag;
struct alloc_tag *tag;
+ /* percpu counters for core allocations are already statically allocated */
if (!mod)
- return;
+ return 0;
start_tag = ct_to_alloc_tag(start);
stop_tag = ct_to_alloc_tag(stop);
@@ -638,12 +639,13 @@ static void load_module(struct module *m
free_percpu(tag->counters);
tag->counters = NULL;
}
- shutdown_mem_profiling(true);
- pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
mod->name);
- break;
+ return -ENOMEM;
}
}
+
+ return 0;
}
static void replace_module(struct module *mod, struct module *new_mod)
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -167,6 +167,7 @@ static int codetag_module_init(struct co
{
struct codetag_range range;
struct codetag_module *cmod;
+ int mod_id;
int err;
range = get_section_range(mod, cttype->desc.section);
@@ -190,11 +191,20 @@ static int codetag_module_init(struct co
cmod->range = range;
down_write(&cttype->mod_lock);
- err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
- if (err >= 0) {
- cttype->count += range_size(cttype, &range);
- if (cttype->desc.module_load)
- cttype->desc.module_load(mod, range.start, range.stop);
+ mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+ if (mod_id >= 0) {
+ if (cttype->desc.module_load) {
+ err = cttype->desc.module_load(mod, range.start, range.stop);
+ if (!err)
+ cttype->count += range_size(cttype, &range);
+ else
+ idr_remove(&cttype->mod_idr, mod_id);
+ } else {
+ cttype->count += range_size(cttype, &range);
+ err = 0;
+ }
+ } else {
+ err = mod_id;
}
up_write(&cttype->mod_lock);
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct modu
mutex_unlock(&codetag_lock);
}
-void codetag_load_module(struct module *mod)
+int codetag_load_module(struct module *mod)
{
struct codetag_type *cttype;
+ int ret = 0;
if (!mod)
- return;
+ return 0;
mutex_lock(&codetag_lock);
- list_for_each_entry(cttype, &codetag_types, link)
- codetag_module_init(cttype, mod);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ ret = codetag_module_init(cttype, mod);
+ if (ret)
+ break;
+ }
mutex_unlock(&codetag_lock);
+
+ return ret;
}
void codetag_unload_module(struct module *mod)

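The propagation pattern above generalizes: a registration loop that calls per-type load hooks has to stop at the first error and report it instead of ignoring it. A standalone sketch of that shape follows; the struct, hook names and error values are invented, only the control flow mirrors codetag_load_module().

/* Standalone sketch of propagating a hook failure out of a registration
 * loop, in the spirit of codetag_load_module(). Names are illustrative. */
#include <errno.h>
#include <stdio.h>

struct tag_type {
	const char *name;
	int (*module_load)(void);   /* may be NULL; returns 0 or -errno */
};

static int ok_load(void)  { return 0; }
static int oom_load(void) { return -ENOMEM; }

static int load_all(struct tag_type *types, int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		if (!types[i].module_load)
			continue;
		ret = types[i].module_load();
		if (ret) {
			fprintf(stderr, "load of %s failed: %d\n",
				types[i].name, ret);
			break;  /* propagate instead of ignoring the error */
		}
	}
	return ret;
}

int main(void)
{
	struct tag_type types[] = {
		{ "alloc_tag",   ok_load },
		{ "pgalloc_tag", oom_load },
	};

	return load_all(types, 2) ? 1 : 0;
}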

@@ -0,0 +1,29 @@
From 3848ddd6068c425b732da6e8c78b047ed28c6114 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 27 Apr 2025 12:39:59 -0400
Subject: svcrdma: Unregister the device if svc_rdma_accept() fails
To handle device removal, svc_rdma_accept() requests removal
notification for the underlying device when accepting a connection.
However svc_rdma_free() is not invoked if svc_rdma_accept() fails.
There needs to be a matching "unregister" in that case; otherwise
the device cannot be removed.
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler")
Cc: stable@vger.kernel.org
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -575,6 +575,7 @@ static struct svc_xprt *svc_rdma_accept(
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
ib_destroy_qp(newxprt->sc_qp);
rdma_destroy_id(newxprt->sc_cm_id);
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
/* This call to put will destroy the transport */
svc_xprt_put(&newxprt->sc_xprt);
return NULL;

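The one-line fix follows the usual rule that every registration taken on the setup path needs a matching release on the failure path. A compressed, self-contained sketch of that unwinding; all helper names are hypothetical and only the ordering is the point.

/* Sketch of mirror-image cleanup on an accept-style failure path. */
#include <stdbool.h>
#include <stdio.h>

static bool register_removal_notifier(void) { puts("register notifier"); return true; }
static void unregister_removal_notifier(void) { puts("unregister notifier"); }
static bool create_queue_pair(void) { puts("create QP"); return false; /* simulated failure */ }

static int accept_connection(void)
{
	if (!register_removal_notifier())
		return -1;

	if (!create_queue_pair()) {
		/* any failure after the registration must undo it */
		unregister_removal_notifier();
		return -1;
	}
	return 0;
}

int main(void)
{
	return accept_connection() ? 1 : 0;
}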

@@ -0,0 +1,53 @@
From 38b409dd5c2fd9496fde05db4fb538a7e3593922 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 21 May 2025 16:34:13 -0400
Subject: SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls
Engineers at Hammerspace noticed that sometimes mounting with
"xprtsec=tls" hangs for a minute or so, and then times out, even
when the NFS server is reachable and responsive.
kTLS shuts off data_ready callbacks if strp->msg_ready is set to
mitigate data_ready callbacks when a full TLS record is not yet
ready to be read from the socket.
Normally msg_ready is clear when the first TLS record arrives on
a socket. However, I observed that sometimes tls_setsockopt() sets
strp->msg_ready, and that prevents forward progress because
tls_data_ready() becomes a no-op.
Moreover, Jakub says: "If there's a full record queued at the time
when [tlshd] passes the socket back to the kernel, it's up to the
reader to read the already queued data out." So SunRPC cannot
expect a data_ready call when ingress data is already waiting.
Add an explicit poll after SunRPC's upper transport is set up to
pick up any data that arrived after the TLS handshake but before
transport set-up is complete.
Reported-by: Steve Sears <sjs@hammerspace.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class")
Tested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
net/sunrpc/xprtsock.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2740,6 +2740,11 @@ static void xs_tcp_tls_setup_socket(stru
}
rpc_shutdown_client(lower_clnt);
+ /* Check for ingress data that arrived before the socket's
+ * ->data_ready callback was set up.
+ */
+ xs_poll_check_readable(upper_transport);
+
out_unlock:
current_restore_flags(pflags, PF_MEMALLOC);
upper_transport->clnt = NULL;

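The same gap exists for any event-driven reader: data that arrives before the readiness callback is armed generates no notification, so the reader has to check once explicitly. Below is a userspace analogue of that explicit check using poll() with a zero timeout; it is illustrative only and not the SunRPC code, and the fd choice is an assumption made to keep the sketch self-contained.

/* Userspace analogue of "check for ingress data that arrived before the
 * data_ready callback was set up": poll once, without waiting, after the
 * event mechanism is armed. */
#include <poll.h>
#include <stdio.h>

static void drain_ready_data(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* timeout 0: do not wait, just report what is already queued */
	if (poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLIN))
		printf("data was already queued before callbacks were armed\n");
}

int main(void)
{
	/* In a real program fd would come from socket()/connect() plus a
	 * completed TLS handshake; 0 (stdin) keeps the sketch standalone. */
	drain_ready_data(0);
	return 0;
}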

@@ -0,0 +1,89 @@
From c3e0e5bd29d97f8e5663026e8c2f25e08f1c4544 Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Thu, 29 May 2025 03:18:30 -0700
Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp()
The MANA driver's probe registers netdevice via the following call chain:
mana_probe()
register_netdev()
register_netdevice()
register_netdevice() calls notifier callback for netvsc driver,
holding the netdev mutex via netdev_lock_ops().
Further, this netvsc notifier callback ends up attempting to acquire the
same lock again in dev_xdp_propagate(), leading to a deadlock.
netvsc_netdev_event()
netvsc_vf_setxdp()
dev_xdp_propagate()
This deadlock was not observed so far because net_shaper_ops was never set,
and thus the lock was effectively a no-op in this case. Fix this by using
netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive
locking in this path.
And, since no deadlock is observed on the other path, which is via
netvsc_probe, add the lock exclusively for that path.
Also, clean up the unregistration path by removing the unnecessary call to
netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already
performs this cleanup via dev_xdp_uninstall().
Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf")
Cc: stable@vger.kernel.org
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Tested-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/hyperv/netvsc_bpf.c | 2 +-
drivers/net/hyperv/netvsc_drv.c | 4 ++--
net/core/dev.c | 1 +
3 files changed, 4 insertions(+), 3 deletions(-)
--- a/drivers/net/hyperv/netvsc_bpf.c
+++ b/drivers/net/hyperv/netvsc_bpf.c
@@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
- ret = dev_xdp_propagate(vf_netdev, &xdp);
+ ret = netif_xdp_propagate(vf_netdev, &xdp);
if (ret && prog)
bpf_prog_put(prog);
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct n
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
- netvsc_vf_setxdp(vf_netdev, NULL);
-
reinit_completion(&net_device_ctx->vf_add);
netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
@@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device
continue;
netvsc_prepare_bonding(vf_netdev);
+ netdev_lock_ops(vf_netdev);
netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ netdev_unlock_ops(vf_netdev);
__netvsc_vf_setup(net, vf_netdev);
break;
}
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9863,6 +9863,7 @@ int netif_xdp_propagate(struct net_devic
return dev->netdev_ops->ndo_bpf(dev, bpf);
}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{

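The underlying pattern is the usual pair of lock-taking and lock-assumed entry points: a path that is already called with the instance lock held (here, the notifier under netdev_lock_ops()) must use the variant that does not take the lock again. A generic sketch of that split using a pthread mutex; the function names are hypothetical.

/* Sketch of the locked/lock-held API split that avoids the recursive
 * acquisition described above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* Variant for callers that already hold dev_lock (e.g. a notifier). */
static void __propagate(int prog)
{
	printf("propagating prog %d (lock held by caller)\n", prog);
}

/* Variant for callers that do not hold the lock. */
static void propagate(int prog)
{
	pthread_mutex_lock(&dev_lock);
	__propagate(prog);
	pthread_mutex_unlock(&dev_lock);
}

static void notifier_event(int prog)
{
	/* Called with dev_lock already held: must not call propagate(),
	 * which would deadlock on the non-recursive mutex. */
	__propagate(prog);
}

int main(void)
{
	propagate(1);                /* normal path takes the lock */

	pthread_mutex_lock(&dev_lock);
	notifier_event(2);           /* notifier path: lock already held */
	pthread_mutex_unlock(&dev_lock);
	return 0;
}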

@@ -0,0 +1,113 @@
From 0f48fca427618cecf6683fa8e46cb8d0b66bb93d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 17:12:44 -0700
Subject: net: clear the dst when changing skb protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A not-so-careful NAT46 BPF program can crash the kernel
if it indiscriminately flips ingress packets from v4 to v6:
BUG: kernel NULL pointer dereference, address: 0000000000000000
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
process_backlog (net/core/dev.c:6186:4)
napi_poll (net/core/dev.c:6906:9)
net_rx_action (net/core/dev.c:7028:13)
do_softirq (kernel/softirq.c:462:3)
netif_rx (net/core/dev.c:5326:3)
dev_loopback_xmit (net/core/dev.c:4015:2)
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
NF_HOOK (./include/linux/netfilter.h:314:9)
ip_mc_output (net/ipv4/ip_output.c:400:5)
dst_output (./include/net/dst.h:459:9)
ip_local_out (net/ipv4/ip_output.c:130:9)
ip_send_skb (net/ipv4/ip_output.c:1496:8)
udp_send_skb (net/ipv4/udp.c:1040:8)
udp_sendmsg (net/ipv4/udp.c:1328:10)
The output interface has a 4->6 program attached at ingress.
We try to loop the multicast skb back to the sending socket.
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
and changes skb->protocol to v6. We enter ip6_rcv_core which
tries to use skb_dst(). But the dst is still an IPv4 one left
after IPv4 mcast output.
Clear the dst in all BPF helpers which change the protocol.
Try to preserve metadata dsts, those may carry non-routing
metadata.
Cc: stable@vger.kernel.org
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250610001245.1981782-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/core/filter.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3232,6 +3232,13 @@ static const struct bpf_func_proto bpf_s
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
/* Caller already did skb_cow() with len as headroom,
@@ -3328,7 +3335,7 @@ static int bpf_skb_proto_4_to_6(struct s
}
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -3358,7 +3365,7 @@ static int bpf_skb_proto_6_to_4(struct s
}
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -3549,10 +3556,10 @@ static int bpf_skb_net_grow(struct sk_bu
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
}
if (skb_is_gso(skb)) {
@@ -3605,10 +3612,10 @@ static int bpf_skb_net_shrink(struct sk_
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);

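As a rough userspace analogy (struct names invented, not kernel code), the invariant encoded by bpf_skb_change_protocol() is that any cached state keyed by the old protocol must be invalidated when the protocol changes.

/* Toy illustration: flipping a packet's address family must drop any
 * cached per-family routing decision. */
#include <stdio.h>
#include <stdlib.h>

struct route  { int family; };
struct packet {
	int family;               /* 4 or 6 */
	struct route *cached_rt;  /* valid only for 'family' */
};

static void set_family(struct packet *pkt, int family)
{
	pkt->family = family;
	/* the cached route was computed for the old family: drop it */
	free(pkt->cached_rt);
	pkt->cached_rt = NULL;
}

int main(void)
{
	struct packet pkt = {
		.family = 4,
		.cached_rt = malloc(sizeof(struct route)),
	};

	set_family(&pkt, 6);
	printf("cached route after family change: %p\n", (void *)pkt.cached_rt);
	return 0;
}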

@@ -0,0 +1,67 @@
From 59765af017c206b162b2ceb8d56a171e40a17719 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Jun 2025 08:35:01 +0000
Subject: net_sched: sch_sfq: reject invalid perturb period
Gerrard Tai reported that SFQ perturb_period has no range check yet,
and this can be used to trigger a race condition fixed in a separate patch.
We want to make sure ctl->perturb_period * HZ will not overflow
and is positive.
Tested:
tc qd add dev lo root sfq perturb -10 # negative value : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 1000000000 # too big : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 2000000 # acceptable value
tc -s -d qd sh dev lo
qdisc sfq 8005: root refcnt 2 limit 127p quantum 64Kb depth 127 flows 128 divisor 1024 perturb 2000000sec
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20250611083501.1810459-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/sched/sch_sfq.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -653,6 +653,14 @@ static int sfq_change(struct Qdisc *sch,
NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
return -EINVAL;
}
+
+ if (ctl->perturb_period < 0 ||
+ ctl->perturb_period > INT_MAX / HZ) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
+ return -EINVAL;
+ }
+ perturb_period = ctl->perturb_period * HZ;
+
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
return -EINVAL;
@@ -669,14 +677,12 @@ static int sfq_change(struct Qdisc *sch,
headdrop = q->headdrop;
maxdepth = q->maxdepth;
maxflows = q->maxflows;
- perturb_period = q->perturb_period;
quantum = q->quantum;
flags = q->flags;
/* update and validate configuration */
if (ctl->quantum)
quantum = ctl->quantum;
- perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {

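The added check is the standard idiom for validating a user-supplied multiplier: compare against INT_MAX / HZ before multiplying, so the product can never overflow. A standalone version of the same check follows; HZ is assumed to be 1000 purely for illustration, and the sample values match the Tested section above.

/* Standalone version of the perturb_period validation: reject negative
 * values and values whose product with HZ would overflow an int. */
#include <limits.h>
#include <stdio.h>

#define HZ 1000   /* illustrative value */

static int perturb_to_jiffies(int period_sec, int *jiffies)
{
	if (period_sec < 0 || period_sec > INT_MAX / HZ)
		return -1;                  /* invalid perturb period */
	*jiffies = period_sec * HZ;         /* now provably cannot overflow */
	return 0;
}

int main(void)
{
	int j;

	printf("-10        -> %s\n", perturb_to_jiffies(-10, &j) ? "rejected" : "ok");
	printf("1000000000 -> %s\n", perturb_to_jiffies(1000000000, &j) ? "rejected" : "ok");
	printf("2000000    -> %s\n", perturb_to_jiffies(2000000, &j) ? "rejected" : "ok");
	return 0;
}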

@@ -0,0 +1,51 @@
From b504e1cd491c55390370059280d5fbaa045d5543 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 13 Jun 2025 19:26:50 +0200
Subject: posix-cpu-timers: fix race between handle_posix_cpu_timers() and
posix_cpu_timer_del()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If an exiting non-autoreaping task has already passed exit_notify() and
calls handle_posix_cpu_timers() from IRQ, it can be reaped by its parent
or debugger right after unlock_task_sighand().
If a concurrent posix_cpu_timer_del() runs at that moment, it won't be
able to detect timer->it.cpu.firing != 0: cpu_timer_task_rcu() and/or
lock_task_sighand() will fail.
Add the tsk->exit_state check into run_posix_cpu_timers() to fix this.
This fix is not needed if CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y, because
exit_task_work() is called before exit_notify(). But the check still
makes sense, task_work_add(&tsk->posix_cputimers_work.work) will fail
anyway in this case.
Cc: stable@vger.kernel.org
Reported-by: Benoît Sevens <bsevens@google.com>
Fixes: 0bdd2ed4138e ("sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
kernel/time/posix-cpu-timers.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/


@@ -0,0 +1,93 @@
From d7b5f2aa34c56bd2a2d3cda2a7eb7aeb24df6179 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 6 Jun 2025 13:50:32 +0100
Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
While an OOM failure in commit_merge() isn't really feasible due to the
allocation which might fail (a maple tree pre-allocation) being 'too small
to fail', we do need to handle this case correctly regardless.
In vma_merge_existing_range(), we can theoretically encounter failures
which result in an OOM error in two ways - firstly dup_anon_vma() might
fail with an OOM error, and secondly commit_merge() failing, ultimately,
to pre-allocate a maple tree node.
The abort logic for dup_anon_vma() resets the VMA iterator to the initial
range, ensuring that any logic looping on this iterator will correctly
proceed to the next VMA.
However the commit_merge() abort logic does not do the same thing. This
resulted in a syzbot report occurring because mlockall() iterates through
VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
being specified due to incorrect iterator state.
While making this change, it became apparent we are duplicating logic -
the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
option on modify/merge, use in uffd release") duplicates the
vmg->give_up_on_oom check in both abort branches.
Additionally, we observe that we can perform the anon_dup check safely on
dup_anon_vma() failure, as this will not be modified should this call
fail.
Finally, we need to reset the iterator in both cases, so now we can simply
use the exact same code to abort for both.
We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
be otherwise and it allows us to implement the abort check more neatly.
Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 22 ++++------------------
1 file changed, 4 insertions(+), 18 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -927,26 +927,9 @@ static __must_check struct vm_area_struc
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- /*
- * We've cleaned up any cloned anon_vma's, no VMAs have been
- * modified, no harm no foul if the user requests that we not
- * report this and just give up, leaving the VMAs unmerged.
- */
- if (!vmg->give_up_on_oom)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -955,6 +938,9 @@ abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
/*
* This means we have failed to clone anon_vma's correctly, but no
* actual changes to VMAs have occurred, so no harm no foul - if the

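The resulting structure is the classic single-abort-label shape: both failure modes (the anon_vma duplication and the maple tree pre-allocation) funnel through one block that rewinds the iterator and unwinds the partial clone. A condensed sketch of that control flow; the helper names and error values are hypothetical.

/* Condensed sketch of funnelling both failure modes through one abort
 * path that resets iterator state. */
#include <errno.h>
#include <stdio.h>

struct iter { int pos; };

static int dup_state(void)       { return 0; }        /* may fail with -ENOMEM */
static int commit_change(void)   { return -ENOMEM; }  /* simulated failure     */
static void undo_dup_state(void) { puts("unwound duplicated state"); }

static int merge(struct iter *it, int start)
{
	int dup_done = 0;
	int err;

	it->pos = start + 1;        /* iterator moved as part of the work */

	err = dup_state();
	if (!err)
		dup_done = 1;
	if (err || commit_change())
		goto abort;
	return 0;

abort:
	it->pos = start;            /* reset so callers can keep iterating */
	if (dup_done)
		undo_dup_state();   /* only undone if it actually succeeded */
	return -1;
}

int main(void)
{
	struct iter it = { 0 };

	merge(&it, 5);
	printf("iterator restored to %d\n", it.pos);
	return 0;
}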

@@ -0,0 +1,90 @@
From db96fe27668a3bb56fa5d745d1c2eed49a95a56f Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 6 Jun 2025 10:28:07 +0100
Subject: mm: close theoretical race where stale TLB entries could linger
Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a
parallel reclaim leaving stale TLB entries") described a theoretical race
as such:
"""
Nadav Amit identified a theoretical race between page reclaim and mprotect
due to TLB flushes being batched outside of the PTL being held.
He described the race as follows:
CPU0 CPU1
---- ----
user accesses memory using RW PTE
[PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
mprotect(addr, PROT_READ)
==> change_pte_range()
==> [ PTE non-present - no flush ]
user writes using cached RW PTE
...
try_to_unmap_flush()
The same type of race exists for reads when protecting for PROT_NONE and
also exists for operations that can leave an old TLB entry behind such as
munmap, mremap and madvise.
"""
The solution was to introduce flush_tlb_batched_pending() and call it
under the PTL from mprotect/madvise/munmap/mremap to complete any pending
tlb flushes.
However, while madvise_free_pte_range() and
madvise_cold_or_pageout_pte_range() were both retro-fitted to call
flush_tlb_batched_pending() immediately after initially acquiring the PTL,
they both temporarily release the PTL to split a large folio if they
stumble upon one. In this case, after re-acquiring the PTL,
flush_tlb_batched_pending() must be called again, but previously it was
not. Let's fix that.
There are 2 Fixes: tags here: the first is the commit that fixed
madvise_free_pte_range(). The second is the commit that added
madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the
faulty pattern from madvise_free_pte_range().
This is a theoretical bug discovered during code review.
Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com
Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries")
Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -503,6 +503,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -736,6 +737,7 @@ static int madvise_free_pte_range(pmd_t
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;

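The rule the fix restores is simple: work that must be done "immediately after taking the lock" has to be repeated every time the lock is re-taken, not just on the first acquisition. A small pthread sketch of that discipline; the function names are made up.

/* Sketch: pending-flush work must be redone after every lock acquisition,
 * including re-acquisition after a temporary drop. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

static void flush_pending(void)     { puts("flushed pending batched work"); }
static void split_large_item(void)  { puts("split item without the lock"); }

static void scan_range(void)
{
	pthread_mutex_lock(&ptl);
	flush_pending();            /* required right after taking the lock */

	/* ... hit a large item that needs the lock dropped ... */
	pthread_mutex_unlock(&ptl);
	split_large_item();

	pthread_mutex_lock(&ptl);
	flush_pending();            /* required again after re-acquiring    */
	/* ... continue the scan ... */
	pthread_mutex_unlock(&ptl);
}

int main(void)
{
	scan_range();
	return 0;
}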

@@ -0,0 +1,33 @@
From f8c6b0801edd6f50057610c67120ffb42027f2c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 11:01:49 -0600
Subject: io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
If peeking a bunch of buffers, normally io_ring_buffers_peek() will
truncate the end buffer. This isn't optimal as presumably more data will
be arriving later, and hence it's better to stop with the last full
buffer rather than truncate the end buffer.
Cc: stable@vger.kernel.org
Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers")
Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/kbuf.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct i
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
- if (!(bl->flags & IOBL_INC))
+ if (!(bl->flags & IOBL_INC)) {
+ if (iov != arg->iovs)
+ break;
buf->len = len;
+ }
}
iov->iov_base = u64_to_user_ptr(buf->addr);

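The behaviour change is easiest to see with concrete numbers: when the remaining budget would only partially fill the next buffer, the loop now stops at the previous full buffer, unless that partial buffer would be the only one selected. A userspace sketch of the selection rule; this is not the io_uring code itself, and the buffer sizes are arbitrary.

/* Userspace sketch of the peek rule above: stop at the last full buffer
 * instead of truncating the final one, unless it is the only buffer. */
#include <stddef.h>
#include <stdio.h>

static size_t peek_buffers(const size_t *buf_len, size_t nbufs,
			   size_t max_len, size_t *used)
{
	size_t total = 0, n = 0;

	for (size_t i = 0; i < nbufs && total < max_len; i++) {
		size_t len = buf_len[i];

		if (total + len > max_len) {
			if (n > 0)
				break;              /* keep only full buffers */
			len = max_len - total;      /* sole buffer: truncate  */
		}
		total += len;
		n++;
	}
	*used = total;
	return n;
}

int main(void)
{
	size_t bufs[] = { 4096, 4096, 4096 };
	size_t used;
	size_t n = peek_buffers(bufs, 3, 10000, &used);

	printf("selected %zu buffers, %zu bytes\n", n, used); /* 2 buffers, 8192 */
	return 0;
}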

@@ -0,0 +1,54 @@
From a2ef8773db38d0c3a41761dbed6fc57afa440161 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 13:37:41 -0600
Subject: nvme: always punt polled uring_cmd end_io work to task_work
Currently NVMe uring_cmd completions will complete locally, if they are
polled. This is done because those completions are always invoked from
task context. And while that is true, there's no guarantee that it's
invoked under the right ring context, or even task. If someone does
NVMe passthrough via multiple threads and with a limited number of
poll queues, then ringA may find completions from ringB. For that case,
completing the request may not be sound.
Always just punt the passthrough completions via task_work, which will
redirect the completion, if needed.
Cc: stable@vger.kernel.org
Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/ioctl.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly. Note that using the uring_cmd
- * helper for this is safe only because we check blk_rq_is_poll().
- * As that returns false if we're NOT on a polled queue, then it's
- * safe to use the polled completion helper.
- *
- * Otherwise, move the completion to task work.
+ * IOPOLL could potentially complete this request directly, but
+ * if multiple rings are polling on the same queue, then it's possible
+ * for one ring to find completions for another ring. Punting the
+ * completion via task_work will always direct it to the right
+ * location, rather than potentially complete requests for ringA
+ * under iopoll invocations from ringB.
*/
- if (blk_rq_is_poll(req)) {
- if (pdu->bio)
- blk_rq_unmap_user(pdu->bio);
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
- } else {
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
- }
-
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
return RQ_END_IO_FREE;
}

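The principle is that a completion discovered by a poller is handed back to the context that owns the request rather than being completed inline, so cross-ring discoveries can never complete the wrong ring's request. A minimal single-process sketch of that hand-off; the structures are invented for illustration and carry none of the real locking.

/* Minimal sketch of punting a completion to its owning context instead of
 * completing it inline in the poller. */
#include <stdio.h>

struct request {
	int owner_ring;
	int done;
};

struct ring {
	int id;
	struct request *pending;   /* completion queued for this ring's owner */
};

/* The poller may discover a completion belonging to another ring. */
static void poll_found_completion(struct ring *poller, struct ring *owner,
				  struct request *req)
{
	(void)poller;
	owner->pending = req;      /* punt: never complete inline here */
}

/* Each ring's owner runs its own deferred work. */
static void run_task_work(struct ring *r)
{
	if (r->pending) {
		r->pending->done = 1;
		printf("ring %d completed request owned by ring %d\n",
		       r->id, r->pending->owner_ring);
		r->pending = NULL;
	}
}

int main(void)
{
	struct ring a = { .id = 0 }, b = { .id = 1 };
	struct request req = { .owner_ring = 1 };

	poll_found_completion(&a, &b, &req);  /* ring A finds ring B's request */
	run_task_work(&b);                    /* ring B completes it itself    */
	return 0;
}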

@@ -0,0 +1,33 @@
From bb51adf56b5adc7075252cd17136c2288c116602 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 11 Jun 2025 09:59:15 +0900
Subject: block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
When blk_zone_write_plug_bio_endio() is called for a regular write BIO
used to emulate a zone append operation, that is, a BIO flagged with
BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the
original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not
cleared. Clear it to fully return the BIO to its original definition.
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-zoned.c | 1 +
1 file changed, 1 insertion(+)
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struc
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
}
/*


@@ -0,0 +1,65 @@
From 56ae62470a95ac8249c43f5c0d50da2a83c350e0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 11 Jun 2025 08:48:46 -0600
Subject: block: use plug request list tail for one-shot backmerge attempt
Previously, the block layer stored the requests in the plug list in
LIFO order. For this reason, blk_attempt_plug_merge() would check
just the head entry for a back merge attempt, and abort after that
unless requests for multiple queues existed in the plug list. If more
than one request is present in the plug list, this makes the one-shot
back merging less useful than before, as it'll always fail to find a
quick merge candidate.
Use the tail entry for the one-shot merge attempt, which is the last
added request in the list. If that fails, abort immediately unless
there are multiple queues available. If multiple queues are available,
then scan the list. Ideally the latter scan would be a backwards scan
of the list, but as it currently stands, the plug list is singly linked
and hence this isn't easily feasible.
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1127,20 +1127,20 @@ bool blk_attempt_plug_merge(struct reque
if (!plug || rq_list_empty(&plug->mq_list))
return false;
- rq_list_for_each(&plug->mq_list, rq) {
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
- break;
- }
+ rq = plug->mq_list.tail;
+ if (rq->q == q)
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK;
+ else if (!plug->multiple_queues)
+ return false;
- /*
- * Only keep iterating plug list for merges if we have multiple
- * queues
- */
- if (!plug->multiple_queues)
- break;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q != q)
+ continue;
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
}
return false;
}
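The restructured logic can be summarized independently of struct request: try the most recently added (tail) entry first, and only fall back to scanning the whole list when requests for more than one queue may be plugged. A self-contained sketch of that ordering; the plug list is modelled as a plain array and all names are invented.

/* Self-contained sketch of the tail-first merge attempt above. */
#include <stdbool.h>
#include <stdio.h>

struct req { int queue; int end_sector; };

static bool try_backmerge(const struct req *rq, int queue, int start_sector)
{
	return rq->queue == queue && rq->end_sector == start_sector;
}

static bool plug_merge(const struct req *list, int n, bool multiple_queues,
		       int queue, int start_sector)
{
	if (n == 0)
		return false;

	/* one-shot attempt against the most recently added request */
	if (list[n - 1].queue == queue)
		return try_backmerge(&list[n - 1], queue, start_sector);
	if (!multiple_queues)
		return false;

	/* several queues plugged: scan for the first request on ours */
	for (int i = 0; i < n; i++) {
		if (list[i].queue != queue)
			continue;
		return try_backmerge(&list[i], queue, start_sector);
	}
	return false;
}

int main(void)
{
	struct req plug[] = { { .queue = 0, .end_sector = 100 },
			      { .queue = 0, .end_sector = 200 } };

	printf("merge at tail: %d\n", plug_merge(plug, 2, false, 0, 200)); /* 1 */
	printf("merge at head: %d\n", plug_merge(plug, 2, false, 0, 100)); /* 0 */
	return 0;
}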