release 6.15.4
@@ -1,4 +1,4 @@
From 9c2fdcdf9d8963a6fa30005a859816639d0bbf95 Mon Sep 17 00:00:00 2001
From b3dc27f64b5d62505ae9f03a6c342a43b0b7e0b2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:54 -0600
Subject: Revert "Disable FOP_DONTCACHE for now due to bugs"
@@ -1,70 +0,0 @@
From 1616d0edbdf3b36a8f4694d35bcf88fa1242c7e8 Mon Sep 17 00:00:00 2001
From: Jinliang Zheng <alexjlzheng@tencent.com>
Date: Tue, 15 Apr 2025 17:02:32 +0800
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()

In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
calling writeback_set_ratelimit(), as global_dirty_limits() always
prioritizes the value of vm_dirty_bytes.

It's domain_dirty_limits() that's relevant here, not node_dirty_ok:

dirty_ratio_handler
 writeback_set_ratelimit
  global_dirty_limits(&dirty_thresh)   <- ratelimit_pages based on dirty_thresh
   domain_dirty_limits
    if (bytes)            <- bytes = vm_dirty_bytes <--------+
     thresh = f1(bytes)   <- prioritizes vm_dirty_bytes      |
    else                                                     |
     thresh = f2(ratio)                                      |
  ratelimit_pages = f3(dirty_thresh)                         |
 vm_dirty_bytes = 0       <- it's late! ---------------------+

This causes ratelimit_pages to still use the value calculated based on
vm_dirty_bytes, which is wrong now.

The impact visible to userspace is difficult to capture directly because
there is no procfs/sysfs interface exported to user space. However, it
will have a real impact on the balance of dirty pages.

For example:

1. On default, we have vm_dirty_ratio=40, vm_dirty_bytes=0

2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192,
   vm_dirty_ratio=0, and ratelimit_pages is calculated based on
   vm_dirty_bytes now.

3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
   zero when writeback_set_ratelimit() -> global_dirty_limits() ->
   domain_dirty_limits() is called, ratelimit_pages is still calculated
   based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
   conform to the actual intent of the user.

Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com>
Reviewed-by: MengEn Sun <mengensun@tencent.com>
Cc: Andrea Righi <andrea@betterlinux.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jinliang Zheng <alexjlzheng@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str

 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
 vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
 }
 return ret;
 }
@@ -1,4 +1,4 @@
From 0274339dc053815d099e9c336f11c1e9e5641792 Mon Sep 17 00:00:00 2001
From 0b8d9b7ae677a03629218f69037be3f342c5ee81 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:55 -0600
Subject: mm/filemap: unify read/write dropbehind naming
@@ -1,179 +0,0 @@
From 87f7435508fde20e21c6b744723a3203e2045f46 Mon Sep 17 00:00:00 2001
From: GONG Ruiqi <gongruiqi1@huawei.com>
Date: Sun, 27 Apr 2025 10:53:03 +0800
Subject: vgacon: Add check for vc_origin address range in vgacon_scroll()

Our in-house Syzkaller reported the following BUG (twice), which we
believed was the same issue with [1]:

==================================================================
BUG: KASAN: slab-out-of-bounds in vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
Read of size 2 at addr ffff88800f5bef60 by task syz.7.2620/12393
...
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0x72/0xa0 lib/dump_stack.c:106
 print_address_description.constprop.0+0x6b/0x3d0 mm/kasan/report.c:364
 print_report+0xba/0x280 mm/kasan/report.c:475
 kasan_report+0xa9/0xe0 mm/kasan/report.c:588
 vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
 vcs_write_buf_noattr drivers/tty/vt/vc_screen.c:493 [inline]
 vcs_write+0x586/0x840 drivers/tty/vt/vc_screen.c:690
 vfs_write+0x219/0x960 fs/read_write.c:584
 ksys_write+0x12e/0x260 fs/read_write.c:639
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x78/0xe2
...
 </TASK>

Allocated by task 5614:
 kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
 kasan_set_track+0x25/0x30 mm/kasan/common.c:52
 ____kasan_kmalloc mm/kasan/common.c:374 [inline]
 __kasan_kmalloc+0x8f/0xa0 mm/kasan/common.c:383
 kasan_kmalloc include/linux/kasan.h:201 [inline]
 __do_kmalloc_node mm/slab_common.c:1007 [inline]
 __kmalloc+0x62/0x140 mm/slab_common.c:1020
 kmalloc include/linux/slab.h:604 [inline]
 kzalloc include/linux/slab.h:721 [inline]
 vc_do_resize+0x235/0xf40 drivers/tty/vt/vt.c:1193
 vgacon_adjust_height+0x2d4/0x350 drivers/video/console/vgacon.c:1007
 vgacon_font_set+0x1f7/0x240 drivers/video/console/vgacon.c:1031
 con_font_set drivers/tty/vt/vt.c:4628 [inline]
 con_font_op+0x4da/0xa20 drivers/tty/vt/vt.c:4675
 vt_k_ioctl+0xa10/0xb30 drivers/tty/vt/vt_ioctl.c:474
 vt_ioctl+0x14c/0x1870 drivers/tty/vt/vt_ioctl.c:752
 tty_ioctl+0x655/0x1510 drivers/tty/tty_io.c:2779
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:871 [inline]
 __se_sys_ioctl+0x12d/0x190 fs/ioctl.c:857
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x78/0xe2

Last potentially related work creation:
 kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
 __kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
 __call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
 netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
 __sock_release+0xb5/0x270 net/socket.c:663
 sock_close+0x1e/0x30 net/socket.c:1425
 __fput+0x408/0xab0 fs/file_table.c:384
 __fput_sync+0x4c/0x60 fs/file_table.c:465
 __do_sys_close fs/open.c:1580 [inline]
 __se_sys_close+0x68/0xd0 fs/open.c:1565
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x78/0xe2

Second to last potentially related work creation:
 kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
 __kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
 __call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
 netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
 __sock_release+0xb5/0x270 net/socket.c:663
 sock_close+0x1e/0x30 net/socket.c:1425
 __fput+0x408/0xab0 fs/file_table.c:384
 task_work_run+0x154/0x240 kernel/task_work.c:239
 exit_task_work include/linux/task_work.h:45 [inline]
 do_exit+0x8e5/0x1320 kernel/exit.c:874
 do_group_exit+0xcd/0x280 kernel/exit.c:1023
 get_signal+0x1675/0x1850 kernel/signal.c:2905
 arch_do_signal_or_restart+0x80/0x3b0 arch/x86/kernel/signal.c:310
 exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
 exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
 __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
 syscall_exit_to_user_mode+0x1b3/0x1e0 kernel/entry/common.c:218
 do_syscall_64+0x66/0x110 arch/x86/entry/common.c:87
 entry_SYSCALL_64_after_hwframe+0x78/0xe2

The buggy address belongs to the object at ffff88800f5be000
which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 2656 bytes to the right of
allocated 1280-byte region [ffff88800f5be000, ffff88800f5be500)

...

Memory state around the buggy address:
 ffff88800f5bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff88800f5bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88800f5bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
                                                       ^
 ffff88800f5bef80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff88800f5bf000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================

By analyzing the vmcore, we found that vc->vc_origin was somehow placed
one line prior to vc->vc_screenbuf when vc was in KD_TEXT mode, and
further writings to /dev/vcs caused out-of-bounds reads (and writes
right after) in vcs_write_buf_noattr().

Our further experiments show that in most cases, vc->vc_origin equals to
vga_vram_base when the console is in KD_TEXT mode, and it's around
vc->vc_screenbuf for the KD_GRAPHICS mode. But via triggering a
TIOCL_SETVESABLANK ioctl beforehand, we can make vc->vc_origin be around
vc->vc_screenbuf while the console is in KD_TEXT mode, and then by
writing the special 'ESC M' control sequence to the tty certain times
(depends on the value of `vc->state.y - vc->vc_top`), we can eventually
move vc->vc_origin prior to vc->vc_screenbuf. Here's the PoC, tested on
QEMU:

```
int main() {
        const int RI_NUM = 10; // should be greater than `vc->state.y - vc->vc_top`
        int tty_fd, vcs_fd;
        const char *tty_path = "/dev/tty0";
        const char *vcs_path = "/dev/vcs";
        const char escape_seq[] = "\x1bM"; // ESC + M
        const char trigger_seq[] = "Let's trigger an OOB write.";
        struct vt_sizes vt_size = { 70, 2 };
        int blank = TIOCL_BLANKSCREEN;

        tty_fd = open(tty_path, O_RDWR);

        char vesa_mode[] = { TIOCL_SETVESABLANK, 1 };
        ioctl(tty_fd, TIOCLINUX, vesa_mode);

        ioctl(tty_fd, TIOCLINUX, &blank);
        ioctl(tty_fd, VT_RESIZE, &vt_size);

        for (int i = 0; i < RI_NUM; ++i)
                write(tty_fd, escape_seq, sizeof(escape_seq) - 1);

        vcs_fd = open(vcs_path, O_RDWR);
        write(vcs_fd, trigger_seq, sizeof(trigger_seq));

        close(vcs_fd);
        close(tty_fd);
        return 0;
}
```

To solve this problem, add an address range validation check in
vgacon_scroll(), ensuring vc->vc_origin never precedes vc_screenbuf.

Reported-by: syzbot+9c09fda97a1a65ea859b@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=9c09fda97a1a65ea859b [1]
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Co-developed-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/console/vgacon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1168,7 +1168,7 @@ static bool vgacon_scroll(struct vc_data
 c->vc_screenbuf_size - delta);
 c->vc_origin = vga_vram_end - c->vc_screenbuf_size;
 vga_rolled_over = 0;
- } else
+ } else if (oldo - delta >= (unsigned long)c->vc_screenbuf)
 c->vc_origin -= delta;
 c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size;
 scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char,
@@ -1,102 +0,0 @@
From 4aed4d2a911e165342a339c886101dbe3acad5e2 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:06 +0300
Subject: fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in
 fb_videomode_to_var

If fb_add_videomode() in do_register_framebuffer() fails to allocate
memory for fb_videomode, it will later lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered while not having the
mode in modelist that is expected to be there, i.e. the one that is
described in fb_info->var.

================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
 display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
 fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
 resize_screen drivers/tty/vt/vt.c:1176 [inline]
 vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
 fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
 fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
 do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
 fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
 vfs_ioctl fs/ioctl.c:48 [inline]
 __do_sys_ioctl fs/ioctl.c:753 [inline]
 __se_sys_ioctl fs/ioctl.c:739 [inline]
 __x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================

Even though fbcon_init() checks beforehand if fb_match_mode() in
var_to_display() fails, it can not prevent the panic because fbcon_init()
does not return error code. Considering this and the comment in the code
about fb_match_mode() returning NULL - "This should not happen" - it is
better to prevent registering the fb_info if its mode was not set
successfully. Also move fb_add_videomode() closer to the beginning of
do_register_framebuffer() to avoid having to do the cleanup on fail.

Found by Linux Verification Center (linuxtesting.org) with Syzkaller.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/fbdev/core/fbmem.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -388,7 +388,7 @@ static int fb_check_foreignness(struct f

 static int do_register_framebuffer(struct fb_info *fb_info)
 {
- int i;
+ int i, err = 0;
 struct fb_videomode mode;

 if (fb_check_foreignness(fb_info))
@@ -397,10 +397,18 @@ static int do_register_framebuffer(struc
 if (num_registered_fb == FB_MAX)
 return -ENXIO;

- num_registered_fb++;
 for (i = 0 ; i < FB_MAX; i++)
 if (!registered_fb[i])
 break;
+
+ if (!fb_info->modelist.prev || !fb_info->modelist.next)
+ INIT_LIST_HEAD(&fb_info->modelist);
+
+ fb_var_to_videomode(&mode, &fb_info->var);
+ err = fb_add_videomode(&mode, &fb_info->modelist);
+ if (err < 0)
+ return err;
+
 fb_info->node = i;
 refcount_set(&fb_info->count, 1);
 mutex_init(&fb_info->lock);
@@ -426,16 +434,12 @@ static int do_register_framebuffer(struc
 if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT))
 bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT);

- if (!fb_info->modelist.prev || !fb_info->modelist.next)
- INIT_LIST_HEAD(&fb_info->modelist);
-
 if (fb_info->skip_vt_switch)
 pm_vt_switch_required(fb_info->device, false);
 else
 pm_vt_switch_required(fb_info->device, true);

- fb_var_to_videomode(&mode, &fb_info->var);
- fb_add_videomode(&mode, &fb_info->modelist);
+ num_registered_fb++;
 registered_fb[i] = fb_info;

 #ifdef CONFIG_GUMSTIX_AM200EPD
@@ -1,4 +1,4 @@
From de09560d2e6fbb14ea586063217277e5ebc1bc71 Mon Sep 17 00:00:00 2001
From 2c1c3b3aafb153cbc3bd298db57cc7313d1601b1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:56 -0600
Subject: mm/filemap: unify dropbehind flag testing and clearing
@@ -1,65 +0,0 @@
From 10c7fce24a1ad9197a8eabbba454a9a872f03d5c Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:07 +0300
Subject: fbdev: Fix fb_set_var to prevent null-ptr-deref in
 fb_videomode_to_var

If fb_add_videomode() in fb_set_var() fails to allocate memory for
fb_videomode, later it may lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered while not having the
mode in modelist that is expected to be there, i.e. the one that is
described in fb_info->var.

================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
 display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
 fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
 resize_screen drivers/tty/vt/vt.c:1176 [inline]
 vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
 fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
 fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
 do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
 fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
 vfs_ioctl fs/ioctl.c:48 [inline]
 __do_sys_ioctl fs/ioctl.c:753 [inline]
 __se_sys_ioctl fs/ioctl.c:739 [inline]
 __x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================

The reason is that fb_info->var is being modified in fb_set_var(), and
then fb_videomode_to_var() is called. If it fails to add the mode to
fb_info->modelist, fb_set_var() returns error, but does not restore the
old value of fb_info->var. Restore fb_info->var on failure the same way
it is done earlier in the function.

Found by Linux Verification Center (linuxtesting.org) with Syzkaller.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/fbdev/core/fbmem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -328,8 +328,10 @@ fb_set_var(struct fb_info *info, struct
 !list_empty(&info->modelist))
 ret = fb_add_videomode(&mode, &info->modelist);

- if (ret)
+ if (ret) {
+ info->var = old_var;
 return ret;
+ }

 event.info = info;
 event.data = &mode;
@@ -1,4 +1,4 @@
From c041325f222c774573ad73d35939451a4e221e52 Mon Sep 17 00:00:00 2001
From 61d27e9dadb2eb2b7596a11a37402452d97625f7 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Mon, 26 May 2025 18:28:18 +0000
Subject: mm/khugepaged: fix race with folio split/free using temporary
@@ -1,113 +0,0 @@
From 13ccad7713b89e7693feb5346e7893dc8edce7a8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:15 +0200
Subject: anon_inode: use a proper mode internally

This allows the VFS to not trip over anonymous inodes and we can add
asserts based on the mode into the vfs. When we report it to userspace
we can simply hide the mode to avoid regressions. I've audited all
direct callers of alloc_anon_inode() and only secretmem overrides i_mode
and i_op inode operations but it already uses a regular file.

Link: https://lore.kernel.org/20250407-work-anon_inode-v1-1-53a44c20d44e@kernel.org
Fixes: af153bb63a336 ("vfs: catch invalid modes in may_open()")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Reported-by: syzbot+5d8e79d323a13aa0b248@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67ed3fb3.050a0220.14623d.0009.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/anon_inodes.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/internal.h | 3 +++
 fs/libfs.c | 8 +++++++-
 3 files changed, 46 insertions(+), 1 deletion(-)

--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,44 @@

 #include <linux/uaccess.h>

+#include "internal.h"
+
 static struct vfsmount *anon_inode_mnt __ro_after_init;
 static struct inode *anon_inode_inode __ro_after_init;

 /*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ *	type = s->st_mode & S_IFMT;
+ *	switch (type) {
+ *	...
+ *	case 0:
+ *		if (!strcmp(p, "anon_inode"))
+ *			Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+};
+
+/*
 * anon_inodefs_dname() is called from d_path().
 */
 static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -66,6 +100,7 @@ static struct inode *anon_inode_make_sec
 if (IS_ERR(inode))
 return inode;
 inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
 error = security_inode_init_security_anon(inode, &QSTR(name),
 context_inode);
 if (error) {
@@ -313,6 +348,7 @@ static int __init anon_inode_init(void)
 anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
 if (IS_ERR(anon_inode_inode))
 panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;

 return 0;
 }
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -343,3 +343,6 @@ static inline bool path_mounted(const st
 void file_f_owner_release(struct file *file);
 bool file_seek_cur_needs_f_lock(struct file *file);
 int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1647,7 +1647,13 @@ struct inode *alloc_anon_inode(struct su
 * that it already _is_ on the dirty list.
 */
 inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ /*
+ * Historically anonymous inodes didn't have a type at all and
+ * userspace has come to rely on this. Internally they're just
+ * regular files but S_IFREG is masked off when reporting
+ * information to userspace.
+ */
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
 inode->i_uid = current_fsuid();
 inode->i_gid = current_fsgid();
 inode->i_flags |= S_PRIVATE;
@@ -1,4 +1,4 @@
From 76653593bdf5fda03717991681b5d60e2af015e9 Mon Sep 17 00:00:00 2001
From 8135974e9e512fdf6d15f59947f95e44f2834c37 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Wed, 30 Apr 2025 10:01:51 +0000
Subject: mm: add folio_expected_ref_count() for reference count calculation
@@ -1,80 +0,0 @@
From 5a3eea2c3e9675a8b713eef0d52b7c437f1f613b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:17 +0200
Subject: anon_inode: explicitly block ->setattr()

It is currently possible to change the mode and owner of the single
anonymous inode in the kernel:

int main(int argc, char *argv[])
{
        int ret, sfd;
        sigset_t mask;
        struct signalfd_siginfo fdsi;

        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);
        sigaddset(&mask, SIGQUIT);

        ret = sigprocmask(SIG_BLOCK, &mask, NULL);
        if (ret < 0)
                _exit(1);

        sfd = signalfd(-1, &mask, 0);
        if (sfd < 0)
                _exit(2);

        ret = fchown(sfd, 5555, 5555);
        if (ret < 0)
                _exit(3);

        ret = fchmod(sfd, 0777);
        if (ret < 0)
                _exit(3);

        _exit(4);
}

This is a bug. It's not really a meaningful one because anonymous inodes
don't really figure into path lookup and they cannot be reopened via
/proc/<pid>/fd/<nr> and can't be used for lookup itself. So they can
only ever serve as direct references.

But it is still completely bogus to allow the mode and ownership or any
of the properties of the anonymous inode to be changed. Block this!

Link: https://lore.kernel.org/20250407-work-anon_inode-v1-3-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/anon_inodes.c | 7 +++++++
 fs/internal.h | 2 ++
 2 files changed, 9 insertions(+)

--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -57,8 +57,15 @@ int anon_inode_getattr(struct mnt_idmap
 return 0;
 }

+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
 static const struct inode_operations anon_inode_operations = {
 .getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
 };

 /*
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -346,3 +346,5 @@ int statmount_mnt_idmap(struct mnt_idmap
 int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
 struct kstat *stat, u32 request_mask,
 unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
debian/patches/patchset-pf/fixes/0006-drm-i915-snps_hdmi_pll-Fix-64-bit-divisor-truncation.patch (new file, 40 lines)
@@ -0,0 +1,40 @@
From 3d1a493525955678c231ab7ccf0950c0ba2b9f45 Mon Sep 17 00:00:00 2001
From: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Date: Fri, 13 Jun 2025 11:42:46 +0530
Subject: drm/i915/snps_hdmi_pll: Fix 64-bit divisor truncation by using
 div64_u64

DIV_ROUND_CLOSEST_ULL uses do_div(), which expects a 32-bit divisor.
When passing a 64-bit constant like CURVE2_MULTIPLIER, the value is
silently truncated to u32, potentially leading to incorrect results
on large divisors.

Replace DIV_ROUND_CLOSEST_ULL with div64_u64(), which correctly
handles full 64-bit division. Since the result is clamped between
1 and 127, rounding is unnecessary and truncating division
is sufficient.

Fixes: 5947642004bf ("drm/i915/display: Add support for SNPS PHY HDMI PLL algorithm for DG2")
Cc: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Cc: Suraj Kandpal <suraj.kandpal@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: <stable@vger.kernel.org> # v6.15+
Signed-off-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/145
---
 drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c
+++ b/drivers/gpu/drm/i915/display/intel_snps_hdmi_pll.c
@@ -103,8 +103,8 @@ static void get_ana_cp_int_prop(u64 vco_
 DIV_ROUND_DOWN_ULL(curve_1_interpolated, CURVE0_MULTIPLIER)));

 ana_cp_int_temp =
- DIV_ROUND_CLOSEST_ULL(DIV_ROUND_DOWN_ULL(adjusted_vco_clk1, curve_2_scaled1),
- CURVE2_MULTIPLIER);
+ div64_u64(DIV_ROUND_DOWN_ULL(adjusted_vco_clk1, curve_2_scaled1),
+ CURVE2_MULTIPLIER);

 *ana_cp_int = max(1, min(ana_cp_int_temp, 127));

@@ -1,39 +0,0 @@
From 8c9775d285f9755477a8b1f8b215102dce014ed2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:19 +0200
Subject: anon_inode: raise SB_I_NODEV and SB_I_NOEXEC

It isn't possible to execute anonymous inodes because they cannot be
opened in any way after they have been created. This includes execution:

execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH)

Anonymous inodes have inode->f_op set to no_open_fops which sets
no_open() which returns ENXIO. That means any call to do_dentry_open()
which is the endpoint of the do_open_execat() will fail. There's no
chance to execute an anonymous inode. Unless a given subsystem overrides
it ofc.

However, we should still harden this and raise SB_I_NODEV and
SB_I_NOEXEC on the superblock itself so that no one gets any creative
ideas.

Link: https://lore.kernel.org/20250407-work-anon_inode-v1-5-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/anon_inodes.c | 2 ++
 1 file changed, 2 insertions(+)

--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -86,6 +86,8 @@ static int anon_inodefs_init_fs_context(
 struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
 if (!ctx)
 return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
 ctx->dops = &anon_inodefs_dentry_operations;
 return 0;
 }
debian/patches/patchset-pf/fixes/0007-mm-shmem-swap-fix-softlockup-with-mTHP-swapin.patch (new file, 190 lines)
@@ -0,0 +1,190 @@
From 3a317593ed60909e02e059a43b2ef588f95fd457 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 10 Jun 2025 01:17:51 +0800
Subject: mm/shmem, swap: fix softlockup with mTHP swapin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Following softlockup can be easily reproduced on my test machine with:

echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
swapon /dev/zram0 # zram0 is a 48G swap device
mkdir -p /sys/fs/cgroup/memory/test
echo 1G > /sys/fs/cgroup/test/memory.max
echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs
while true; do
 dd if=/dev/zero of=/tmp/test.img bs=1M count=5120
 cat /tmp/test.img > /dev/null
 rm /tmp/test.img
done

Then after a while:
watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787]
Modules linked in: zram virtiofs
CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)·
Tainted: [L]=SOFTLOCKUP
Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
RIP: 0010:mpol_shared_policy_lookup+0xd/0x70
Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8
RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202
RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001
RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518
RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001
R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000
FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 <TASK>
 shmem_alloc_folio+0x31/0xc0
 shmem_swapin_folio+0x309/0xcf0
 ? filemap_get_entry+0x117/0x1e0
 ? xas_load+0xd/0xb0
 ? filemap_get_entry+0x101/0x1e0
 shmem_get_folio_gfp+0x2ed/0x5b0
 shmem_file_read_iter+0x7f/0x2e0
 vfs_read+0x252/0x330
 ksys_read+0x68/0xf0
 do_syscall_64+0x4c/0x1c0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f03f9a46991
Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec
RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991
RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003
RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000
R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000
 </TASK>

The reason is simple, readahead brought some order 0 folio in swap cache,
and the swapin mTHP folio being allocated is in conflict with it, so
swapcache_prepare fails and causes shmem_swap_alloc_folio to return
-EEXIST, and shmem simply retries again and again causing this loop.

Fix it by applying a similar fix for anon mTHP swapin.

The performance change is very slight, time of swapin 10g zero folios
with shmem (test for 12 times):
Before: 2.47s
After: 2.48s

[kasong@tencent.com: add comment]
Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com
Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device")
Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 20 --------------------
 mm/shmem.c | 6 +++++-
 mm/swap.h | 23 +++++++++++++++++++++++
 3 files changed, 28 insertions(+), 21 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4225,26 +4225,6 @@ static struct folio *__alloc_swap_folio(
 }

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
 /*
 * Check if the PTEs within a range are contiguous swap entries
 * and have consistent swapcache, zeromap.
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2262,6 +2262,7 @@ static int shmem_swapin_folio(struct ino
 folio = swap_cache_get_folio(swap, NULL, 0);
 order = xa_get_order(&mapping->i_pages, index);
 if (!folio) {
+ int nr_pages = 1 << order;
 bool fallback_order0 = false;

 /* Or update major stats only when swapin succeeds?? */
@@ -2275,9 +2276,12 @@ static int shmem_swapin_folio(struct ino
 * If uffd is active for the vma, we need per-page fault
 * fidelity to maintain the uffd semantics, then fallback
 * to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
 */
 if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled()))
+ !zswap_never_enabled() ||
+ non_swapcache_batch(swap, nr_pages) != nr_pages))
 fallback_order0 = true;

 /* Skip swapcache for synchronous device. */
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp
 return find_next_bit(sis->zeromap, end, start) - start;
 }

+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure all entries are not cached, otherwise, the mTHP folio will
+ * be in conflict with the folio in swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
 #else /* CONFIG_SWAP */
 struct swap_iocb;
 static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp
 return 0;
 }

+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
 #endif /* CONFIG_SWAP */

 #endif /* _MM_SWAP_H */
@@ -1,136 +0,0 @@
From d90681a50098e204f2e111b9433f6fc73a939854 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 21 Apr 2025 10:27:40 +0200
Subject: fs: add S_ANON_INODE

This makes it easy to detect proper anonymous inodes and to ensure that
we can detect them in codepaths such as readahead().

Readahead on anonymous inodes didn't work because they didn't have a
proper mode. Now that they have we need to retain EINVAL being returned
otherwise LTP will fail.

We also need to ensure that ioctls aren't simply fired like they are for
regular files so things like inotify inodes continue to correctly call
their own ioctl handlers as in [1].

Reported-by: Xilin Wu <sophon@radxa.com>
Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1]
Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ioctl.c | 7 ++++---
 fs/libfs.c | 2 +-
 fs/pidfs.c | 2 +-
 include/linux/fs.h | 2 ++
 mm/readahead.c | 20 ++++++++++++++++----
 5 files changed, 24 insertions(+), 9 deletions(-)

--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *fil
 return ioctl_fioasync(fd, filp, argp);

 case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
 S_ISLNK(inode->i_mode)) {
 loff_t res = inode_get_bytes(inode);
 return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *fil
 return ioctl_file_dedupe_range(filp, argp);

 case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
 return vfs_ioctl(filp, cmd, arg);

 return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *fil
 return ioctl_get_fs_sysfs_path(filp, argp);

 default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
 return file_ioctl(filp, cmd, argp);
 break;
 }
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1656,7 +1656,7 @@ struct inode *alloc_anon_inode(struct su
 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
 inode->i_uid = current_fsuid();
 inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
 simple_inode_init_ts(inode);
 return inode;
 }
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -826,7 +826,7 @@ static int pidfs_init_inode(struct inode
 const struct pid *pid = data;

 inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
 inode->i_mode |= S_IRWXU;
 inode->i_op = &pidfs_inode_operations;
 inode->i_fop = &pidfs_file_operations;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2344,6 +2344,7 @@ struct super_operations {
 #define S_CASEFOLD (1 << 15) /* Casefolded file */
 #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
 #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */

 /*
 * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struc

 #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
 (inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)

 static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
 struct inode *inode)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);

 ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 {
+ struct file *file;
+ const struct inode *inode;
+
 CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;

- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
 return -EBADF;

 /*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t of
 * that can execute readahead. If readahead is not possible
 * on this file, then we must return -EINVAL.
 */
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
 return -EINVAL;

 return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
debian/patches/patchset-pf/fixes/0008-mm-gup-revert-mm-gup-fix-infinite-loop-within-__get_.patch (new file, 100 lines)
@@ -0,0 +1,100 @@
From 4b247e559e4046bbbfab468e66f9d3197eaf12ec Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 11 Jun 2025 15:13:14 +0200
Subject: mm/gup: revert "mm: gup: fix infinite loop within
 __get_longterm_locked"

After commit 1aaf8c122918 ("mm: gup: fix infinite loop within
__get_longterm_locked") we are able to longterm pin folios that are not
supposed to get longterm pinned, simply because they temporarily have the
LRU flag cleared (esp. temporarily isolated).

For example, two __get_longterm_locked() callers can race, or
__get_longterm_locked() can race with anything else that temporarily
isolates folios.

The introducing commit mentions the use case of a driver that uses
vm_ops->fault to insert pages allocated through cma_alloc() into the page
tables, assuming they can later get longterm pinned. These pages/folios
would never have the LRU flag set and consequently cannot get isolated.
There is no known in-tree user making use of that so far, fortunately.

To handle that in the future -- and avoid retrying forever to
isolate/migrate them -- we will need a different mechanism for the CMA
area *owner* to indicate that it actually already allocated the page and
is fine with longterm pinning it. The LRU flag is not suitable for that.

Probably we can lookup the relevant CMA area and query the bitmap; we only
have to care about some races, probably. If already allocated, we
could just allow longterm pinning.

Anyhow, let's fix the "must not be longterm pinned" problem first by
reverting the original commit.

Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com
Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked")
Signed-off-by: David Hildenbrand <david@redhat.com>
Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/
Reported-by: Hyesoo Yu <hyesoo.yu@samsung.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Cc: Aijun Sun <aijun.sun@unisoc.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/gup.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2320,13 +2320,13 @@ static void pofs_unpin(struct pages_or_f
 /*
 * Returns the number of collected folios. Return value is always >= 0.
 */
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
 struct list_head *movable_folio_list,
 struct pages_or_folios *pofs)
 {
+ unsigned long i, collected = 0;
 struct folio *prev_folio = NULL;
 bool drain_allow = true;
- unsigned long i;

 for (i = 0; i < pofs->nr_entries; i++) {
 struct folio *folio = pofs_get_folio(pofs, i);
@@ -2338,6 +2338,8 @@ static void collect_longterm_unpinnable_
 if (folio_is_longterm_pinnable(folio))
 continue;

+ collected++;
+
 if (folio_is_device_coherent(folio))
 continue;

@@ -2359,6 +2361,8 @@ static void collect_longterm_unpinnable_
 NR_ISOLATED_ANON + folio_is_file_lru(folio),
 folio_nr_pages(folio));
 }
+
+ return collected;
 }

 /*
@@ -2435,9 +2439,11 @@ static long
 check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
 {
 LIST_HEAD(movable_folio_list);
+ unsigned long collected;

- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
 return 0;

 return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
@@ -1,35 +0,0 @@
From c161e0ffb55a12b9b26819fa0ecf8217ab781e97 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Wed, 7 May 2025 19:50:26 +0800
Subject: configfs: Do not override creating attribute file failure in
 populate_attrs()

populate_attrs() may override failure for creating attribute files
by success for creating subsequent bin attribute files, and have
wrong return value.

Fix by creating bin attribute files under successfully creating
attribute files.

Fixes: 03607ace807b ("configfs: implement binary attributes")
Cc: stable@vger.kernel.org
Reviewed-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20250507-fix_configfs-v3-2-fe2d96de8dc4@quicinc.com
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
 fs/configfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -619,7 +619,7 @@ static int populate_attrs(struct config_
 break;
 }
 }
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
 for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
 if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
 continue;
debian/patches/patchset-pf/fixes/0009-mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap.patch (new file, 191 lines)
@@ -0,0 +1,191 @@
|
||||
From 7ebf89b788aa5b83897e99ad6e3dd6e0cb0f5030 Mon Sep 17 00:00:00 2001
|
||||
From: Kairui Song <kasong@tencent.com>
|
||||
Date: Wed, 4 Jun 2025 23:10:38 +0800
|
||||
Subject: mm: userfaultfd: fix race of userfaultfd_move and swap cache
|
||||
|
||||
This commit fixes two kinds of races, they may have different results:
|
||||
|
||||
Barry reported a BUG_ON in commit c50f8e6053b0, we may see the same
|
||||
BUG_ON if the filemap lookup returned NULL and folio is added to swap
|
||||
cache after that.
|
||||
|
||||
If another kind of race is triggered (folio changed after lookup) we
|
||||
may see RSS counter is corrupted:
|
||||
|
||||
[ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
|
||||
type:MM_ANONPAGES val:-1
|
||||
[ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
|
||||
type:MM_SHMEMPAGES val:1
|
||||
|
||||
Because the folio is being accounted to the wrong VMA.
|
||||
|
||||
I'm not sure if there will be any data corruption though, seems no.
|
||||
The issues above are critical already.
|
||||
|
||||
|
||||
On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache
|
||||
lookup, and tries to move the found folio to the faulting vma. Currently,
|
||||
it relies on checking the PTE value to ensure that the moved folio still
|
||||
belongs to the src swap entry and that no new folio has been added to the
|
||||
swap cache, which turns out to be unreliable.
|
||||
|
||||
While working and reviewing the swap table series with Barry, following
|
||||
existing races are observed and reproduced [1]:
|
||||
|
||||
In the example below, move_pages_pte is moving src_pte to dst_pte, where
|
||||
src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the
|
||||
swap cache:
|
||||
|
||||
CPU1 CPU2
|
||||
userfaultfd_move
|
||||
move_pages_pte()
|
||||
entry = pte_to_swp_entry(orig_src_pte);
|
||||
// Here it got entry = S1
|
||||
... < interrupted> ...
|
||||
<swapin src_pte, alloc and use folio A>
|
||||
// folio A is a new allocated folio
|
||||
// and get installed into src_pte
|
||||
<frees swap entry S1>
|
||||
// src_pte now points to folio A, S1
|
||||
// has swap count == 0, it can be freed
|
||||
// by folio_swap_swap or swap
|
||||
// allocator's reclaim.
|
||||
<try to swap out another folio B>
|
||||
// folio B is a folio in another VMA.
|
||||
<put folio B to swap cache using S1 >
|
||||
// S1 is freed, folio B can use it
|
||||
// for swap out with no problem.
|
||||
...
|
||||
folio = filemap_get_folio(S1)
|
||||
// Got folio B here !!!
|
||||
... < interrupted again> ...
|
||||
<swapin folio B and free S1>
|
||||
// Now S1 is free to be used again.
|
||||
<swapout src_pte & folio A using S1>
|
||||
// Now src_pte is a swap entry PTE
|
||||
// holding S1 again.
|
||||
folio_trylock(folio)
|
||||
move_swap_pte
|
||||
double_pt_lock
|
||||
is_pte_pages_stable
|
||||
// Check passed because src_pte == S1
|
||||
folio_move_anon_rmap(...)
|
||||
// Moved invalid folio B here !!!
|
||||
|
||||
The race window is very short and requires multiple collisions of multiple
|
||||
rare events, so it's very unlikely to happen, but with a deliberately
|
||||
constructed reproducer and increased time window, it can be reproduced
|
||||
easily.
|
||||
|
||||
This can be fixed by checking if the folio returned by filemap is the
|
||||
valid swap cache folio after acquiring the folio lock.
|
||||
|
||||
Another similar race is possible: filemap_get_folio may return NULL, but
|
||||
folio (A) could be swapped in and then swapped out again using the same
|
||||
swap entry after the lookup. In such a case, folio (A) may remain in the
|
||||
swap cache, so it must be moved too:
|
||||
|
||||
CPU1                               CPU2
userfaultfd_move
  move_pages_pte()
    entry = pte_to_swp_entry(orig_src_pte);
    // Here it got entry = S1, and S1 is not in swap cache
    folio = filemap_get_folio(S1)
    // Got NULL
    ... < interrupted again> ...
                                   <swapin folio A and free S1>
                                   <swapout folio A re-using S1>
    move_swap_pte
      double_pt_lock
      is_pte_pages_stable
      // Check passed because src_pte == S1
      folio_move_anon_rmap(...)
      // folio A is ignored !!!
|
||||
|
||||
Fix this by checking the swap cache again after acquiring the src_pte
|
||||
lock. And to avoid the filemap overhead, we check swap_map directly [2].
|
||||
|
||||
The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far
|
||||
we don't need to worry about that, since folios can only be exposed to the
|
||||
swap cache in the swap out path, and this is covered in this patch by
|
||||
checking the swap cache again after acquiring the src_pte lock.
|
||||
|
||||
Testing with a simple C program that allocates and moves several GB of
|
||||
memory did not show any observable performance change.
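
In short, move_swap_pte() now re-validates the lockless lookup result twice. A condensed sketch of the added logic (simplified from the hunks below, not the verbatim kernel code) is:

    /* After locking the folio: is it still the swap cache folio for S1? */
    if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
                              entry.val != src_folio->swap.val))
            return -EAGAIN;

    double_pt_lock(dst_ptl, src_ptl);
    /* ... PTE stability checks ... */
    /* No folio found earlier: did one appear in the swap cache meanwhile? */
    if (!src_folio &&
        (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE)) {
            double_pt_unlock(dst_ptl, src_ptl);
            return -EAGAIN;
    }

Both failure paths simply return -EAGAIN so the caller retries the move.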
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com
|
||||
Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
|
||||
Signed-off-by: Kairui Song <kasong@tencent.com>
|
||||
Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoSt=9+JwP7Q@mail.gmail.com/ [1]
|
||||
Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8eT=A@mail.gmail.com/ [2]
|
||||
Reviewed-by: Lokesh Gidra <lokeshgidra@google.com>
|
||||
Acked-by: Peter Xu <peterx@redhat.com>
|
||||
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
|
||||
Reviewed-by: Barry Song <baohua@kernel.org>
|
||||
Reviewed-by: Chris Li <chrisl@kernel.org>
|
||||
Cc: Andrea Arcangeli <aarcange@redhat.com>
|
||||
Cc: David Hildenbrand <david@redhat.com>
|
||||
Cc: Kairui Song <kasong@tencent.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++--
|
||||
1 file changed, 31 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/mm/userfaultfd.c
|
||||
+++ b/mm/userfaultfd.c
|
||||
@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struc
|
||||
pte_t orig_dst_pte, pte_t orig_src_pte,
|
||||
pmd_t *dst_pmd, pmd_t dst_pmdval,
|
||||
spinlock_t *dst_ptl, spinlock_t *src_ptl,
|
||||
- struct folio *src_folio)
|
||||
+ struct folio *src_folio,
|
||||
+ struct swap_info_struct *si, swp_entry_t entry)
|
||||
{
|
||||
+ /*
|
||||
+ * Check if the folio still belongs to the target swap entry after
|
||||
+ * acquiring the lock. Folio can be freed in the swap cache while
|
||||
+ * not locked.
|
||||
+ */
|
||||
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
|
||||
+ entry.val != src_folio->swap.val))
|
||||
+ return -EAGAIN;
|
||||
+
|
||||
double_pt_lock(dst_ptl, src_ptl);
|
||||
|
||||
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
|
||||
@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struc
|
||||
if (src_folio) {
|
||||
folio_move_anon_rmap(src_folio, dst_vma);
|
||||
src_folio->index = linear_page_index(dst_vma, dst_addr);
|
||||
+ } else {
|
||||
+ /*
|
||||
+ * Check if the swap entry is cached after acquiring the src_pte
|
||||
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
|
||||
+ *
|
||||
+ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
|
||||
+ * We are trying to catch newly added swap cache, the only possible case is
|
||||
+ * when a folio is swapped in and out again staying in swap cache, using the
|
||||
+ * same entry before the PTE check above. The PTL is acquired and released
|
||||
+ * twice, each time after updating the swap_map's flag. So holding
|
||||
+ * the PTL here ensures we see the updated value. False positive is possible,
|
||||
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
|
||||
+ * cache, or during the tiny synchronization window between swap cache and
|
||||
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
|
||||
+ */
|
||||
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
|
||||
+ double_pt_unlock(dst_ptl, src_ptl);
|
||||
+ return -EAGAIN;
|
||||
+ }
|
||||
}
|
||||
|
||||
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
|
||||
@@ -1412,7 +1441,7 @@ retry:
|
||||
}
|
||||
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
|
||||
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
|
||||
- dst_ptl, src_ptl, src_folio);
|
||||
+ dst_ptl, src_ptl, src_folio, si, entry);
|
||||
}
|
||||
|
||||
out:
|
26
debian/patches/patchset-pf/fixes/0010-dm-raid-fix-variable-in-journal-device-check.patch
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
From 222985dcb732fae554af5276f44c30d648a1d05b Mon Sep 17 00:00:00 2001
|
||||
From: Heinz Mauelshagen <heinzm@redhat.com>
|
||||
Date: Tue, 10 Jun 2025 20:53:30 +0200
|
||||
Subject: dm-raid: fix variable in journal device check
|
||||
|
||||
Replace "rdev" with correct loop variable name "r".
|
||||
|
||||
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: 63c32ed4afc2 ("dm raid: add raid4/5/6 journaling support")
|
||||
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
|
||||
---
|
||||
drivers/md/dm-raid.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/md/dm-raid.c
|
||||
+++ b/drivers/md/dm-raid.c
|
||||
@@ -2410,7 +2410,7 @@ static int super_init_validation(struct
|
||||
*/
|
||||
sb_retrieve_failed_devices(sb, failed_devices);
|
||||
rdev_for_each(r, mddev) {
|
||||
- if (test_bit(Journal, &rdev->flags) ||
|
||||
+ if (test_bit(Journal, &r->flags) ||
|
||||
!r->sb_page)
|
||||
continue;
|
||||
sb2 = page_address(r->sb_page);
|
@@ -1,129 +0,0 @@
|
||||
From 1e9a258def978a9388a50ae43c85557b0598a7d3 Mon Sep 17 00:00:00 2001
|
||||
From: Pu Lehui <pulehui@huawei.com>
|
||||
Date: Thu, 29 May 2025 15:56:47 +0000
|
||||
Subject: mm: fix uprobe pte be overwritten when expanding vma
|
||||
|
||||
Patch series "Fix uprobe pte be overwritten when expanding vma".
|
||||
|
||||
|
||||
This patch (of 4):
|
||||
|
||||
We encountered a BUG alert triggered by Syzkaller as follows:
|
||||
BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1
|
||||
|
||||
And we can reproduce it with the following steps:
|
||||
1. register uprobe on file at zero offset
|
||||
2. mmap the file at zero offset:
|
||||
addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
|
||||
3. mremap part of vma1 to new vma2:
|
||||
addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
|
||||
4. mremap back to orig addr1:
|
||||
mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
|
||||
|
||||
In step 3, the vma1 range [addr1, addr1 + 4096] will be remapped to the new
vma2 with range [addr2, addr2 + 8192], the uprobe anon page will be remapped
from vma1 to vma2, and then the vma1 range [addr1, addr1 + 4096] is unmapped.

In step 4, the vma2 range [addr2, addr2 + 4096] will be remapped back to the
addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096,
addr1 + 8192] still maps the file, vma_merge_new_range will expand the range
and then do uprobe_mmap in vma_complete. Since the merged vma pgoff is also
zero offset, it will install the uprobe anon page into the merged vma.
However, the upcoming move_page_tables step, which uses set_pte_at to remap
the vma2 uprobe pte into the merged vma, will overwrite the newly installed
uprobe pte in the merged vma and leave that pte orphaned.

Since the uprobe pte will be remapped to the merged vma, we can remove the
unnecessary uprobe_mmap upon the merged vma.
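
A minimal userspace sketch of the reproduction steps above is shown here; it is only an illustration and assumes a uprobe has already been registered at offset 0 of "testfile" (step 1), e.g. through the tracefs uprobe interface:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <fcntl.h>

    int main(void)
    {
            int fd = open("testfile", O_RDONLY);    /* file with a uprobe at offset 0 */

            /* step 2: mmap the file at zero offset */
            void *addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
            /* step 3: mremap part of vma1 to a new vma2 */
            void *addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
            /* step 4: mremap back to the original addr1 */
            mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
            return 0;
    }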
|
||||
|
||||
This problem was first found in linux-6.6.y and also exists in the
|
||||
community syzkaller:
|
||||
https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com
|
||||
Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com
|
||||
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
|
||||
Signed-off-by: Pu Lehui <pulehui@huawei.com>
|
||||
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Cc: Jann Horn <jannh@google.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
|
||||
Cc: Oleg Nesterov <oleg@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vma.c | 20 +++++++++++++++++---
|
||||
mm/vma.h | 7 +++++++
|
||||
2 files changed, 24 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -144,6 +144,9 @@ static void init_multi_vma_prep(struct v
|
||||
vp->file = vma->vm_file;
|
||||
if (vp->file)
|
||||
vp->mapping = vma->vm_file->f_mapping;
|
||||
+
|
||||
+ if (vmg && vmg->skip_vma_uprobe)
|
||||
+ vp->skip_vma_uprobe = true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -333,10 +336,13 @@ static void vma_complete(struct vma_prep
|
||||
|
||||
if (vp->file) {
|
||||
i_mmap_unlock_write(vp->mapping);
|
||||
- uprobe_mmap(vp->vma);
|
||||
|
||||
- if (vp->adj_next)
|
||||
- uprobe_mmap(vp->adj_next);
|
||||
+ if (!vp->skip_vma_uprobe) {
|
||||
+ uprobe_mmap(vp->vma);
|
||||
+
|
||||
+ if (vp->adj_next)
|
||||
+ uprobe_mmap(vp->adj_next);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (vp->remove) {
|
||||
@@ -1783,6 +1789,14 @@ struct vm_area_struct *copy_vma(struct v
|
||||
faulted_in_anon_vma = false;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * If the VMA we are copying might contain a uprobe PTE, ensure
|
||||
+ * that we do not establish one upon merge. Otherwise, when mremap()
|
||||
+ * moves page tables, it will orphan the newly created PTE.
|
||||
+ */
|
||||
+ if (vma->vm_file)
|
||||
+ vmg.skip_vma_uprobe = true;
|
||||
+
|
||||
new_vma = find_vma_prev(mm, addr, &vmg.prev);
|
||||
if (new_vma && new_vma->vm_start < addr + len)
|
||||
return NULL; /* should never get here */
|
||||
--- a/mm/vma.h
|
||||
+++ b/mm/vma.h
|
||||
@@ -19,6 +19,8 @@ struct vma_prepare {
|
||||
struct vm_area_struct *insert;
|
||||
struct vm_area_struct *remove;
|
||||
struct vm_area_struct *remove2;
|
||||
+
|
||||
+ bool skip_vma_uprobe :1;
|
||||
};
|
||||
|
||||
struct unlink_vma_file_batch {
|
||||
@@ -120,6 +122,11 @@ struct vma_merge_struct {
|
||||
*/
|
||||
bool give_up_on_oom :1;
|
||||
|
||||
+ /*
|
||||
+ * If set, skip uprobe_mmap upon merged vma.
|
||||
+ */
|
||||
+ bool skip_vma_uprobe :1;
|
||||
+
|
||||
/* Internal flags set during merge process: */
|
||||
|
||||
/*
|
@@ -1,217 +0,0 @@
|
||||
From 2d8c79ec421253aab9560a47a7e73d678c84585c Mon Sep 17 00:00:00 2001
|
||||
From: Jann Horn <jannh@google.com>
|
||||
Date: Tue, 27 May 2025 23:23:53 +0200
|
||||
Subject: mm/hugetlb: unshare page tables during VMA split, not before
|
||||
|
||||
Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early: it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.
|
||||
|
||||
Fix it by explicitly calling into the hugetlb unshare logic from
|
||||
__split_vma() in the same place where THP splitting also happens. At that
|
||||
point, both the VMA and the rmap(s) are write-locked.
|
||||
|
||||
An annoying detail is that we can now call into the helper
|
||||
hugetlb_unshare_pmds() from two different locking contexts:
|
||||
|
||||
1. from hugetlb_split(), holding:
|
||||
- mmap lock (exclusively)
|
||||
- VMA lock
|
||||
- file rmap lock (exclusively)
|
||||
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
|
||||
call us with only the mmap lock held (in shared mode), but currently
|
||||
only runs while holding mmap lock (exclusively) and VMA lock
|
||||
|
||||
Backporting note:
|
||||
This commit fixes a racy protection that was introduced in commit
|
||||
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
|
||||
commit claimed to fix an issue introduced in 5.13, but it should actually
|
||||
also go all the way back.
|
||||
|
||||
[jannh@google.com: v2]
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
|
||||
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
|
||||
Signed-off-by: Jann Horn <jannh@google.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reviewed-by: Oscar Salvador <osalvador@suse.de>
|
||||
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/hugetlb.h | 3 ++
|
||||
mm/hugetlb.c | 60 +++++++++++++++++++++++---------
|
||||
mm/vma.c | 7 ++++
|
||||
tools/testing/vma/vma_internal.h | 2 ++
|
||||
4 files changed, 56 insertions(+), 16 deletions(-)
|
||||
|
||||
--- a/include/linux/hugetlb.h
|
||||
+++ b/include/linux/hugetlb.h
|
||||
@@ -276,6 +276,7 @@ bool is_hugetlb_entry_migration(pte_t pt
|
||||
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
|
||||
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
|
||||
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
|
||||
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
#else /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
@@ -473,6 +474,8 @@ static inline void fixup_hugetlb_reserva
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
|
||||
+
|
||||
#endif /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
#ifndef pgd_write
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -120,7 +120,7 @@ static void hugetlb_vma_lock_free(struct
|
||||
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
|
||||
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
|
||||
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
|
||||
- unsigned long start, unsigned long end);
|
||||
+ unsigned long start, unsigned long end, bool take_locks);
|
||||
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
|
||||
|
||||
static void hugetlb_free_folio(struct folio *folio)
|
||||
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm
|
||||
{
|
||||
if (addr & ~(huge_page_mask(hstate_vma(vma))))
|
||||
return -EINVAL;
|
||||
+ return 0;
|
||||
+}
|
||||
|
||||
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
|
||||
+{
|
||||
/*
|
||||
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
|
||||
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
|
||||
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
|
||||
+ * This function is called in the middle of a VMA split operation, with
|
||||
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
|
||||
+ * walks (except hardware and gup_fast()).
|
||||
*/
|
||||
+ vma_assert_write_locked(vma);
|
||||
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
|
||||
+
|
||||
if (addr & ~PUD_MASK) {
|
||||
- /*
|
||||
- * hugetlb_vm_op_split is called right before we attempt to
|
||||
- * split the VMA. We will need to unshare PMDs in the old and
|
||||
- * new VMAs, so let's unshare before we split.
|
||||
- */
|
||||
unsigned long floor = addr & PUD_MASK;
|
||||
unsigned long ceil = floor + PUD_SIZE;
|
||||
|
||||
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
|
||||
- hugetlb_unshare_pmds(vma, floor, ceil);
|
||||
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
|
||||
+ /*
|
||||
+ * Locking:
|
||||
+ * Use take_locks=false here.
|
||||
+ * The file rmap lock is already held.
|
||||
+ * The hugetlb VMA lock can't be taken when we already
|
||||
+ * hold the file rmap lock, and we don't need it because
|
||||
+ * its purpose is to synchronize against concurrent page
|
||||
+ * table walks, which are not possible thanks to the
|
||||
+ * locks held by our caller.
|
||||
+ */
|
||||
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
|
||||
+ }
|
||||
}
|
||||
-
|
||||
- return 0;
|
||||
}
|
||||
|
||||
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
|
||||
@@ -7884,9 +7898,16 @@ void move_hugetlb_state(struct folio *ol
|
||||
spin_unlock_irq(&hugetlb_lock);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * If @take_locks is false, the caller must ensure that no concurrent page table
|
||||
+ * access can happen (except for gup_fast() and hardware page walks).
|
||||
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
|
||||
+ * concurrent page fault handling) and the file rmap lock.
|
||||
+ */
|
||||
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
- unsigned long end)
|
||||
+ unsigned long end,
|
||||
+ bool take_locks)
|
||||
{
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
unsigned long sz = huge_page_size(h);
|
||||
@@ -7910,8 +7931,12 @@ static void hugetlb_unshare_pmds(struct
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
|
||||
start, end);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
- hugetlb_vma_lock_write(vma);
|
||||
- i_mmap_lock_write(vma->vm_file->f_mapping);
|
||||
+ if (take_locks) {
|
||||
+ hugetlb_vma_lock_write(vma);
|
||||
+ i_mmap_lock_write(vma->vm_file->f_mapping);
|
||||
+ } else {
|
||||
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
|
||||
+ }
|
||||
for (address = start; address < end; address += PUD_SIZE) {
|
||||
ptep = hugetlb_walk(vma, address, sz);
|
||||
if (!ptep)
|
||||
@@ -7921,8 +7946,10 @@ static void hugetlb_unshare_pmds(struct
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
flush_hugetlb_tlb_range(vma, start, end);
|
||||
- i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
- hugetlb_vma_unlock_write(vma);
|
||||
+ if (take_locks) {
|
||||
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
+ hugetlb_vma_unlock_write(vma);
|
||||
+ }
|
||||
/*
|
||||
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
|
||||
* Documentation/mm/mmu_notifier.rst.
|
||||
@@ -7937,7 +7964,8 @@ static void hugetlb_unshare_pmds(struct
|
||||
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
|
||||
{
|
||||
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
|
||||
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
|
||||
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
|
||||
+ /* take_locks = */ true);
|
||||
}
|
||||
|
||||
/*
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -516,7 +516,14 @@ __split_vma(struct vma_iterator *vmi, st
|
||||
init_vma_prep(&vp, vma);
|
||||
vp.insert = new;
|
||||
vma_prepare(&vp);
|
||||
+
|
||||
+ /*
|
||||
+ * Get rid of huge pages and shared page tables straddling the split
|
||||
+ * boundary.
|
||||
+ */
|
||||
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
|
||||
+ if (is_vm_hugetlb_page(vma))
|
||||
+ hugetlb_split(vma, addr);
|
||||
|
||||
if (new_below) {
|
||||
vma->vm_start = addr;
|
||||
--- a/tools/testing/vma/vma_internal.h
|
||||
+++ b/tools/testing/vma/vma_internal.h
|
||||
@@ -793,6 +793,8 @@ static inline void vma_adjust_trans_huge
|
||||
(void)next;
|
||||
}
|
||||
|
||||
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
|
||||
+
|
||||
static inline void vma_iter_free(struct vma_iterator *vmi)
|
||||
{
|
||||
mas_destroy(&vmi->mas);
|
@@ -1,50 +0,0 @@
|
||||
From e1280358284feaf844db5c6a76078b2c1738c5ae Mon Sep 17 00:00:00 2001
|
||||
From: Jann Horn <jannh@google.com>
|
||||
Date: Tue, 27 May 2025 23:23:54 +0200
|
||||
Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
|
||||
|
||||
huge_pmd_unshare() drops a reference on a page table that may have
|
||||
previously been shared across processes, potentially turning it into a
|
||||
normal page table used in another process in which unrelated VMAs can
|
||||
afterwards be installed.
|
||||
|
||||
If this happens in the middle of a concurrent gup_fast(), gup_fast() could
|
||||
end up walking the page tables of another process. While I don't see any
|
||||
way in which that immediately leads to kernel memory corruption, it is
|
||||
really weird and unexpected.
|
||||
|
||||
Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
|
||||
just like we do in khugepaged when removing page tables for a THP
|
||||
collapse.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
|
||||
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
|
||||
Signed-off-by: Jann Horn <jannh@google.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Muchun Song <muchun.song@linux.dev>
|
||||
Cc: Oscar Salvador <osalvador@suse.de>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/hugetlb.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -7628,6 +7628,13 @@ int huge_pmd_unshare(struct mm_struct *m
|
||||
return 0;
|
||||
|
||||
pud_clear(pud);
|
||||
+ /*
|
||||
+ * Once our caller drops the rmap lock, some other process might be
|
||||
+ * using this page table as a normal, non-hugetlb page table.
|
||||
+ * Wait for pending gup_fast() in other threads to finish before letting
|
||||
+ * that happen.
|
||||
+ */
|
||||
+ tlb_remove_table_sync_one();
|
||||
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
|
||||
mm_dec_nr_pmds(mm);
|
||||
return 1;
|
@@ -1,48 +0,0 @@
|
||||
From b36611870ea72c82eb78d90a017658394bdb9690 Mon Sep 17 00:00:00 2001
|
||||
From: SeongJae Park <sj@kernel.org>
|
||||
Date: Mon, 2 Jun 2025 10:49:26 -0700
|
||||
Subject: mm/madvise: handle madvise_lock() failure during race unwinding
|
||||
|
||||
When unwinding the race on -ERESTARTNOINTR handling of process_madvise(),
a madvise_lock() failure is ignored. Check for the failure and abort the
remaining work in that case.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
|
||||
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
|
||||
Signed-off-by: SeongJae Park <sj@kernel.org>
|
||||
Reported-by: Barry Song <21cnbao@gmail.com>
|
||||
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
|
||||
Reviewed-by: Jann Horn <jannh@google.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
|
||||
Reviewed-by: Barry Song <baohua@kernel.org>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/madvise.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/mm/madvise.c
|
||||
+++ b/mm/madvise.c
|
||||
@@ -1830,7 +1830,9 @@ static ssize_t vector_madvise(struct mm_
|
||||
|
||||
/* Drop and reacquire lock to unwind race. */
|
||||
madvise_unlock(mm, behavior);
|
||||
- madvise_lock(mm, behavior);
|
||||
+ ret = madvise_lock(mm, behavior);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
continue;
|
||||
}
|
||||
if (ret < 0)
|
||||
@@ -1839,6 +1841,7 @@ static ssize_t vector_madvise(struct mm_
|
||||
}
|
||||
madvise_unlock(mm, behavior);
|
||||
|
||||
+out:
|
||||
ret = (total_len - iov_iter_count(iter)) ? : ret;
|
||||
|
||||
return ret;
|
@@ -1,164 +0,0 @@
|
||||
From f0ab226d0eae3aa7e26524efc040026a65ead640 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Date: Wed, 28 May 2025 10:02:08 +0200
|
||||
Subject: video: screen_info: Relocate framebuffers behind PCI bridges
|
||||
|
||||
Apply PCI host-bridge window offsets to screen_info framebuffers. Fixes
|
||||
invalid access to I/O memory.
|
||||
|
||||
Resources behind a PCI host bridge can be relocated by a certain offset
in the kernel's CPU address range used for I/O. The framebuffer memory
range stored in screen_info refers to the CPU addresses as seen during
boot (where the offset is 0). During boot-up, firmware may assign a
different memory offset to the PCI host bridge and thereby relocate the
framebuffer address of the PCI graphics device as seen by the kernel.
The information in screen_info must be updated as well.
|
||||
|
||||
The helper pcibios_bus_to_resource() performs the relocation of the
|
||||
screen_info's framebuffer resource (given in PCI bus addresses). The
|
||||
result matches the I/O-memory resource of the PCI graphics device (given
|
||||
in CPU addresses). As before, we store away the information necessary to
|
||||
later update the information in screen_info itself.
|
||||
|
||||
Commit 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated
|
||||
EFI framebuffers") added the code for updating screen_info. It is based
|
||||
on similar functionality that pre-existed in efifb. Efifb uses a pointer
|
||||
to the PCI resource, while the newer code does a memcpy of the region.
|
||||
Hence efifb sees any updates to the PCI resource and avoids the issue.
|
||||
|
||||
v3:
|
||||
- Only use struct pci_bus_region for PCI bus addresses (Bjorn)
|
||||
- Clarify address semantics in commit messages and comments (Bjorn)
|
||||
v2:
|
||||
- Fixed tags (Takashi, Ivan)
|
||||
- Updated information on efifb
|
||||
|
||||
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Reported-by: "Ivan T. Ivanov" <iivanov@suse.de>
|
||||
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1240696
|
||||
Tested-by: "Ivan T. Ivanov" <iivanov@suse.de>
|
||||
Fixes: 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated EFI framebuffers")
|
||||
Cc: dri-devel@lists.freedesktop.org
|
||||
Cc: <stable@vger.kernel.org> # v6.9+
|
||||
Link: https://lore.kernel.org/r/20250528080234.7380-1-tzimmermann@suse.de
|
||||
---
|
||||
drivers/video/screen_info_pci.c | 79 +++++++++++++++++++++------------
|
||||
1 file changed, 50 insertions(+), 29 deletions(-)
|
||||
|
||||
--- a/drivers/video/screen_info_pci.c
|
||||
+++ b/drivers/video/screen_info_pci.c
|
||||
@@ -7,8 +7,8 @@
|
||||
|
||||
static struct pci_dev *screen_info_lfb_pdev;
|
||||
static size_t screen_info_lfb_bar;
|
||||
-static resource_size_t screen_info_lfb_offset;
|
||||
-static struct resource screen_info_lfb_res = DEFINE_RES_MEM(0, 0);
|
||||
+static resource_size_t screen_info_lfb_res_start; // original start of resource
|
||||
+static resource_size_t screen_info_lfb_offset; // framebuffer offset within resource
|
||||
|
||||
static bool __screen_info_relocation_is_valid(const struct screen_info *si, struct resource *pr)
|
||||
{
|
||||
@@ -31,7 +31,7 @@ void screen_info_apply_fixups(void)
|
||||
if (screen_info_lfb_pdev) {
|
||||
struct resource *pr = &screen_info_lfb_pdev->resource[screen_info_lfb_bar];
|
||||
|
||||
- if (pr->start != screen_info_lfb_res.start) {
|
||||
+ if (pr->start != screen_info_lfb_res_start) {
|
||||
if (__screen_info_relocation_is_valid(si, pr)) {
|
||||
/*
|
||||
* Only update base if we have an actual
|
||||
@@ -47,46 +47,67 @@ void screen_info_apply_fixups(void)
|
||||
}
|
||||
}
|
||||
|
||||
+static int __screen_info_lfb_pci_bus_region(const struct screen_info *si, unsigned int type,
|
||||
+ struct pci_bus_region *r)
|
||||
+{
|
||||
+ u64 base, size;
|
||||
+
|
||||
+ base = __screen_info_lfb_base(si);
|
||||
+ if (!base)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ size = __screen_info_lfb_size(si, type);
|
||||
+ if (!size)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ r->start = base;
|
||||
+ r->end = base + size - 1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static void screen_info_fixup_lfb(struct pci_dev *pdev)
|
||||
{
|
||||
unsigned int type;
|
||||
- struct resource res[SCREEN_INFO_MAX_RESOURCES];
|
||||
- size_t i, numres;
|
||||
+ struct pci_bus_region bus_region;
|
||||
int ret;
|
||||
+ struct resource r = {
|
||||
+ .flags = IORESOURCE_MEM,
|
||||
+ };
|
||||
+ const struct resource *pr;
|
||||
const struct screen_info *si = &screen_info;
|
||||
|
||||
if (screen_info_lfb_pdev)
|
||||
return; // already found
|
||||
|
||||
type = screen_info_video_type(si);
|
||||
- if (type != VIDEO_TYPE_EFI)
|
||||
- return; // only applies to EFI
|
||||
+ if (!__screen_info_has_lfb(type))
|
||||
+ return; // only applies to EFI; maybe VESA
|
||||
|
||||
- ret = screen_info_resources(si, res, ARRAY_SIZE(res));
|
||||
+ ret = __screen_info_lfb_pci_bus_region(si, type, &bus_region);
|
||||
if (ret < 0)
|
||||
return;
|
||||
- numres = ret;
|
||||
|
||||
- for (i = 0; i < numres; ++i) {
|
||||
- struct resource *r = &res[i];
|
||||
- const struct resource *pr;
|
||||
-
|
||||
- if (!(r->flags & IORESOURCE_MEM))
|
||||
- continue;
|
||||
- pr = pci_find_resource(pdev, r);
|
||||
- if (!pr)
|
||||
- continue;
|
||||
-
|
||||
- /*
|
||||
- * We've found a PCI device with the framebuffer
|
||||
- * resource. Store away the parameters to track
|
||||
- * relocation of the framebuffer aperture.
|
||||
- */
|
||||
- screen_info_lfb_pdev = pdev;
|
||||
- screen_info_lfb_bar = pr - pdev->resource;
|
||||
- screen_info_lfb_offset = r->start - pr->start;
|
||||
- memcpy(&screen_info_lfb_res, r, sizeof(screen_info_lfb_res));
|
||||
- }
|
||||
+ /*
|
||||
+ * Translate the PCI bus address to resource. Account
|
||||
+ * for an offset if the framebuffer is behind a PCI host
|
||||
+ * bridge.
|
||||
+ */
|
||||
+ pcibios_bus_to_resource(pdev->bus, &r, &bus_region);
|
||||
+
|
||||
+ pr = pci_find_resource(pdev, &r);
|
||||
+ if (!pr)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * We've found a PCI device with the framebuffer
|
||||
+ * resource. Store away the parameters to track
|
||||
+ * relocation of the framebuffer aperture.
|
||||
+ */
|
||||
+ screen_info_lfb_pdev = pdev;
|
||||
+ screen_info_lfb_bar = pr - pdev->resource;
|
||||
+ screen_info_lfb_offset = r.start - pr->start;
|
||||
+ screen_info_lfb_res_start = bus_region.start;
|
||||
}
|
||||
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY, 16,
|
||||
screen_info_fixup_lfb);
|
@@ -1,86 +0,0 @@
|
||||
From 717bcb42b8cd4119c88249fbfc26d08e25a2ca24 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Date: Tue, 3 Jun 2025 17:48:20 +0200
|
||||
Subject: sysfb: Fix screen_info type check for VGA
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Use the helper screen_info_video_type() to get the framebuffer
|
||||
type from struct screen_info. Handle supported values in sorted
|
||||
switch statement.
|
||||
|
||||
Reading orig_video_isVGA is unreliable. On most systems it is a
|
||||
VIDEO_TYPE_ constant. On some systems with VGA it is simply set
|
||||
to 1 to signal the presence of a VGA output. See vga_probe() for
|
||||
an example. Retrieving the screen_info type with the helper
|
||||
screen_info_video_type() detects these cases and returns the
|
||||
appropriate VIDEO_TYPE_ constant. For VGA, sysfb creates a device
|
||||
named "vga-framebuffer".
|
||||
|
||||
The sysfb code has been taken from vga16fb, where it likely didn't
|
||||
work correctly either. With this bugfix applied, vga16fb loads for
|
||||
compatible vga-framebuffer devices.
|
||||
|
||||
Fixes: 0db5b61e0dc0 ("fbdev/vga16fb: Create EGA/VGA devices in sysfb code")
|
||||
Cc: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Cc: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Cc: Alex Deucher <alexander.deucher@amd.com>
|
||||
Cc: Tzung-Bi Shih <tzungbi@kernel.org>
|
||||
Cc: Helge Deller <deller@gmx.de>
|
||||
Cc: "Uwe Kleine-König" <u.kleine-koenig@baylibre.com>
|
||||
Cc: Zsolt Kajtar <soci@c64.rulez.org>
|
||||
Cc: <stable@vger.kernel.org> # v6.1+
|
||||
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
|
||||
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Link: https://lore.kernel.org/r/20250603154838.401882-1-tzimmermann@suse.de
|
||||
---
|
||||
drivers/firmware/sysfb.c | 26 ++++++++++++++++++--------
|
||||
1 file changed, 18 insertions(+), 8 deletions(-)
|
||||
|
||||
--- a/drivers/firmware/sysfb.c
|
||||
+++ b/drivers/firmware/sysfb.c
|
||||
@@ -143,6 +143,7 @@ static __init int sysfb_init(void)
|
||||
{
|
||||
struct screen_info *si = &screen_info;
|
||||
struct device *parent;
|
||||
+ unsigned int type;
|
||||
struct simplefb_platform_data mode;
|
||||
const char *name;
|
||||
bool compatible;
|
||||
@@ -170,17 +171,26 @@ static __init int sysfb_init(void)
|
||||
goto put_device;
|
||||
}
|
||||
|
||||
+ type = screen_info_video_type(si);
|
||||
+
|
||||
/* if the FB is incompatible, create a legacy framebuffer device */
|
||||
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
|
||||
- name = "efi-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
|
||||
- name = "vesa-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_VGAC)
|
||||
- name = "vga-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_EGAC)
|
||||
+ switch (type) {
|
||||
+ case VIDEO_TYPE_EGAC:
|
||||
name = "ega-framebuffer";
|
||||
- else
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_VGAC:
|
||||
+ name = "vga-framebuffer";
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_VLFB:
|
||||
+ name = "vesa-framebuffer";
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_EFI:
|
||||
+ name = "efi-framebuffer";
|
||||
+ break;
|
||||
+ default:
|
||||
name = "platform-framebuffer";
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
pd = platform_device_alloc(name, 0);
|
||||
if (!pd) {
|
@@ -1,200 +0,0 @@
|
||||
From 08b1e02fc44abc04d813dbc827812db9ebca0dad Mon Sep 17 00:00:00 2001
|
||||
From: Luo Gengkun <luogengkun@huaweicloud.com>
|
||||
Date: Mon, 21 Apr 2025 03:50:21 +0000
|
||||
Subject: watchdog: fix watchdog may detect false positive of softlockup
|
||||
|
||||
When updating `watchdog_thresh`, there is a race condition between writing
the new `watchdog_thresh` value and stopping the old watchdog timer. If
the old timer triggers during this window, it may falsely detect a
softlockup due to the old interval and the new `watchdog_thresh` value
being used. The problem can be described as follows:

# We assume the previous watchdog_thresh is 60, so the watchdog timer is
# coming every 24s.
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
          |
          +------>+ update watchdog_thresh (We are in kernel now)
          |
          |        # using old interval and new `watchdog_thresh`
          +------>+ watchdog hrtimer (irq context: detect softlockup)
          |
          |
          +-------+
                  |
                  |
                  + softlockup_stop_all
|
||||
|
||||
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
|
||||
The update to the actual `watchdog_thresh` is delayed until after the old
|
||||
timer is stopped, preventing false positives.
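
Condensed from the patch below, the reconfiguration path now orders the update like this:

    softlockup_stop_all();                  /* old timers are gone first */
    if (thresh_changed)
            watchdog_thresh = READ_ONCE(watchdog_thresh_next);  /* then publish the new value */
    set_sample_period();
    lockup_detector_update_enable();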
|
||||
|
||||
The following testcase may help to understand this problem.
|
||||
|
||||
---------------------------------------------
|
||||
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
|
||||
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
|
||||
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
|
||||
echo 60 > /proc/sys/kernel/watchdog_thresh
|
||||
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
|
||||
echo 10 > /proc/sys/kernel/watchdog_thresh &
|
||||
---------------------------------------------
|
||||
|
||||
The test case above first removes the throttling restrictions for
real-time tasks. It then sets watchdog_thresh to 60 and executes a
real-time task, a simple while(1) loop, on cpu3. Consequently, the final
command gets blocked because the presence of this real-time thread
prevents kworker:3 from being selected by the scheduler. This eventually
triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating
with inconsistent variables - using both the old interval and the updated
watchdog_thresh simultaneously.
|
||||
|
||||
[nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
|
||||
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
|
||||
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com
|
||||
Signed-off-by: Luo Gengkun <luogengkun@huaweicloud.com>
|
||||
Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
|
||||
Cc: Doug Anderson <dianders@chromium.org>
|
||||
Cc: Joel Granados <joel.granados@kernel.org>
|
||||
Cc: Song Liu <song@kernel.org>
|
||||
Cc: Thomas Gleixner <tglx@linutronix.de>
|
||||
Cc: "Nysal Jan K.A." <nysal@linux.ibm.com>
|
||||
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
|
||||
1 file changed, 27 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/kernel/watchdog.c
|
||||
+++ b/kernel/watchdog.c
|
||||
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
|
||||
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
|
||||
static int __read_mostly watchdog_softlockup_user_enabled = 1;
|
||||
int __read_mostly watchdog_thresh = 10;
|
||||
+static int __read_mostly watchdog_thresh_next;
|
||||
static int __read_mostly watchdog_hardlockup_available;
|
||||
|
||||
struct cpumask watchdog_cpumask __read_mostly;
|
||||
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static void __lockup_detector_reconfigure(void)
|
||||
+static void __lockup_detector_reconfigure(bool thresh_changed)
|
||||
{
|
||||
cpus_read_lock();
|
||||
watchdog_hardlockup_stop();
|
||||
|
||||
softlockup_stop_all();
|
||||
+ /*
|
||||
+ * To prevent watchdog_timer_fn from using the old interval and
|
||||
+ * the new watchdog_thresh at the same time, which could lead to
|
||||
+ * false softlockup reports, it is necessary to update the
|
||||
+ * watchdog_thresh after the softlockup is completed.
|
||||
+ */
|
||||
+ if (thresh_changed)
|
||||
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
|
||||
set_sample_period();
|
||||
lockup_detector_update_enable();
|
||||
if (watchdog_enabled && watchdog_thresh)
|
||||
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
|
||||
void lockup_detector_reconfigure(void)
|
||||
{
|
||||
mutex_lock(&watchdog_mutex);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
}
|
||||
|
||||
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
|
||||
return;
|
||||
|
||||
mutex_lock(&watchdog_mutex);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
softlockup_initialized = true;
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
-static void __lockup_detector_reconfigure(void)
|
||||
+static void __lockup_detector_reconfigure(bool thresh_changed)
|
||||
{
|
||||
cpus_read_lock();
|
||||
watchdog_hardlockup_stop();
|
||||
+ if (thresh_changed)
|
||||
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
|
||||
lockup_detector_update_enable();
|
||||
watchdog_hardlockup_start();
|
||||
cpus_read_unlock();
|
||||
}
|
||||
void lockup_detector_reconfigure(void)
|
||||
{
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
}
|
||||
static inline void lockup_detector_setup(void)
|
||||
{
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
}
|
||||
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
|
||||
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
|
||||
#ifdef CONFIG_SYSCTL
|
||||
|
||||
/* Propagate any changes to the watchdog infrastructure */
|
||||
-static void proc_watchdog_update(void)
|
||||
+static void proc_watchdog_update(bool thresh_changed)
|
||||
{
|
||||
/* Remove impossible cpus to keep sysctl output clean. */
|
||||
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(thresh_changed);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
|
||||
} else {
|
||||
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!err && old != READ_ONCE(*param))
|
||||
- proc_watchdog_update();
|
||||
+ proc_watchdog_update(false);
|
||||
}
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
|
||||
|
||||
mutex_lock(&watchdog_mutex);
|
||||
|
||||
- old = READ_ONCE(watchdog_thresh);
|
||||
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
|
||||
+
|
||||
+ old = watchdog_thresh_next;
|
||||
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
- if (!err && write && old != READ_ONCE(watchdog_thresh))
|
||||
- proc_watchdog_update();
|
||||
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
|
||||
+ proc_watchdog_update(true);
|
||||
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
|
||||
|
||||
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
|
||||
if (!err && write)
|
||||
- proc_watchdog_update();
|
||||
+ proc_watchdog_update(false);
|
||||
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
|
||||
},
|
||||
{
|
||||
.procname = "watchdog_thresh",
|
||||
- .data = &watchdog_thresh,
|
||||
+ .data = &watchdog_thresh_next,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_watchdog_thresh,
|
@@ -1,288 +0,0 @@
|
||||
From ff8503c4997332bb5708c3b77f8a19f334e947a9 Mon Sep 17 00:00:00 2001
|
||||
From: Harshit Agarwal <harshit@nutanix.com>
|
||||
Date: Tue, 25 Feb 2025 18:05:53 +0000
|
||||
Subject: sched/rt: Fix race in push_rt_task
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Overview
|
||||
========
|
||||
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue, it will call the find_lock_lowest_rq method,
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task has already migrated and is running on some
other CPU. These cases are already handled. However, if the task has
migrated, has already been executed, and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqueue
(on_rq is 1), and the task was last run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
|
||||
|
||||
Crashes
|
||||
=======
|
||||
This bug resulted in quite a few flavors of crashes triggering kernel
|
||||
panics with various crash signatures such as assert failures, page
|
||||
faults, null pointer dereferences, and queue corruption errors all
|
||||
coming from scheduler itself.
|
||||
|
||||
Some of the crashes:
|
||||
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? die+0x2a/0x50
|
||||
? do_trap+0x85/0x100
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? do_error_trap+0x64/0xa0
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? exc_invalid_op+0x4c/0x60
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? asm_exc_invalid_op+0x12/0x20
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
__schedule+0x5cb/0x790
|
||||
? update_ts_time_stats+0x55/0x70
|
||||
schedule_idle+0x1e/0x40
|
||||
do_idle+0x15e/0x200
|
||||
cpu_startup_entry+0x19/0x20
|
||||
start_secondary+0x117/0x160
|
||||
secondary_startup_64_no_verify+0xb0/0xbb
|
||||
|
||||
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? no_context+0x183/0x350
|
||||
? __warn+0x8a/0xe0
|
||||
? exc_page_fault+0x3d6/0x520
|
||||
? asm_exc_page_fault+0x1e/0x30
|
||||
? pick_next_task_rt+0xb5/0x1d0
|
||||
? pick_next_task_rt+0x8c/0x1d0
|
||||
__schedule+0x583/0x7e0
|
||||
? update_ts_time_stats+0x55/0x70
|
||||
schedule_idle+0x1e/0x40
|
||||
do_idle+0x15e/0x200
|
||||
cpu_startup_entry+0x19/0x20
|
||||
start_secondary+0x117/0x160
|
||||
secondary_startup_64_no_verify+0xb0/0xbb
|
||||
|
||||
-> BUG: unable to handle page fault for address: ffff9464daea5900
|
||||
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
|
||||
|
||||
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? die+0x2a/0x50
|
||||
? do_trap+0x85/0x100
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? do_error_trap+0x64/0xa0
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? exc_invalid_op+0x4c/0x60
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? asm_exc_invalid_op+0x12/0x20
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
dequeue_rt_entity+0x1f/0x70
|
||||
dequeue_task_rt+0x2d/0x70
|
||||
__schedule+0x1a8/0x7e0
|
||||
? blk_finish_plug+0x25/0x40
|
||||
schedule+0x3c/0xb0
|
||||
futex_wait_queue_me+0xb6/0x120
|
||||
futex_wait+0xd9/0x240
|
||||
do_futex+0x344/0xa90
|
||||
? get_mm_exe_file+0x30/0x60
|
||||
? audit_exe_compare+0x58/0x70
|
||||
? audit_filter_rules.constprop.26+0x65e/0x1220
|
||||
__x64_sys_futex+0x148/0x1f0
|
||||
do_syscall_64+0x30/0x80
|
||||
entry_SYSCALL_64_after_hwframe+0x62/0xc7
|
||||
|
||||
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? no_context+0x183/0x350
|
||||
? spurious_kernel_fault+0x171/0x1c0
|
||||
? exc_page_fault+0x3b6/0x520
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? asm_exc_page_fault+0x1e/0x30
|
||||
? _cond_resched+0x15/0x30
|
||||
? futex_wait_queue_me+0xc8/0x120
|
||||
? futex_wait+0xd9/0x240
|
||||
? try_to_wake_up+0x1b8/0x490
|
||||
? futex_wake+0x78/0x160
|
||||
? do_futex+0xcd/0xa90
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? plist_del+0x6a/0xd0
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? dequeue_pushable_task+0x20/0x70
|
||||
? __schedule+0x382/0x7e0
|
||||
? asm_sysvec_reschedule_ipi+0xa/0x20
|
||||
? schedule+0x3c/0xb0
|
||||
? exit_to_user_mode_prepare+0x9e/0x150
|
||||
? irqentry_exit_to_user_mode+0x5/0x30
|
||||
? asm_sysvec_reschedule_ipi+0x12/0x20
|
||||
|
||||
Above are some of the common examples of the crashes that were observed
|
||||
due to this issue.
|
||||
|
||||
Details
|
||||
=======
|
||||
Let's look at the following scenario to understand this race.
|
||||
|
||||
1) CPU A enters push_rt_task
|
||||
a) CPU A has chosen next_task = task p.
|
||||
b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq).
|
||||
c) CPU A identifies CPU X as a destination CPU (X < Z).
|
||||
d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq).
|
||||
e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has
|
||||
locked CPU X’s rq, and thus, CPU A must wait.
|
||||
|
||||
2) At CPU Z
|
||||
a) Previous task has completed execution and thus, CPU Z enters
|
||||
schedule, locks its own rq after CPU A releases it.
|
||||
b) CPU Z dequeues previous task and begins executing task p.
|
||||
c) CPU Z unlocks its rq.
|
||||
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
|
||||
lock) which triggers the schedule function on CPU Z.
|
||||
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
|
||||
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
|
||||
|
||||
3) At CPU B
|
||||
a) CPU B enters try_to_wake_up with input task p.
|
||||
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
|
||||
B.state = WAKING.
|
||||
c) CPU B via select_task_rq determines CPU Y as the target CPU.
|
||||
|
||||
4) The race
|
||||
a) CPU A acquires CPU X’s lock and relocks CPU Z.
|
||||
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
|
||||
still on CPU Z.
|
||||
c) CPU A failed to notice task p had been dequeued from CPU Z while
|
||||
CPU A was waiting for locks in double_lock_balance. If CPU A knew
|
||||
that task p had been dequeued, it would return NULL forcing
|
||||
push_rt_task to give up the task p's migration.
|
||||
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
|
||||
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
|
||||
p.on_rq = 1.
|
||||
f) CPU B unlocks CPU Y, triggering memory synchronization.
|
||||
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
|
||||
has not migrated.
|
||||
h) CPU A decides to migrate p to CPU X.
|
||||
|
||||
This leads to A dequeuing p from Y's queue and various crashes down the
|
||||
line.
|
||||
|
||||
Solution
|
||||
========
|
||||
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is not suitable for being
pushed out anyway.
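
Condensed sketch of the tightened check in find_lock_lowest_rq() (see the hunk below):

    /* After re-taking both rq locks, trust only the pushable tasks list. */
    if (unlikely(is_migration_disabled(task) ||
                 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
                 task != pick_next_pushable_task(rq))) {
            double_unlock_balance(rq, lowest_rq);
            lowest_rq = NULL;
    }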
|
||||
|
||||
Testing
|
||||
=======
|
||||
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such
a cluster and was stable for more than 30 days.
|
||||
|
||||
Co-developed-by: Jon Kohler <jon@nutanix.com>
|
||||
Signed-off-by: Jon Kohler <jon@nutanix.com>
|
||||
Co-developed-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
|
||||
Signed-off-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
|
||||
Co-developed-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
|
||||
Signed-off-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
|
||||
Signed-off-by: Harshit Agarwal <harshit@nutanix.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
|
||||
Reviewed-by: Phil Auld <pauld@redhat.com>
|
||||
Tested-by: Will Ton <william.ton@nutanix.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
|
||||
---
|
||||
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
|
||||
1 file changed, 26 insertions(+), 28 deletions(-)
|
||||
|
||||
--- a/kernel/sched/rt.c
|
||||
+++ b/kernel/sched/rt.c
|
||||
@@ -1883,6 +1883,27 @@ static int find_lowest_rq(struct task_st
|
||||
return -1;
|
||||
}
|
||||
|
||||
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
+{
|
||||
+ struct task_struct *p;
|
||||
+
|
||||
+ if (!has_pushable_tasks(rq))
|
||||
+ return NULL;
|
||||
+
|
||||
+ p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
+ struct task_struct, pushable_tasks);
|
||||
+
|
||||
+ BUG_ON(rq->cpu != task_cpu(p));
|
||||
+ BUG_ON(task_current(rq, p));
|
||||
+ BUG_ON(task_current_donor(rq, p));
|
||||
+ BUG_ON(p->nr_cpus_allowed <= 1);
|
||||
+
|
||||
+ BUG_ON(!task_on_rq_queued(p));
|
||||
+ BUG_ON(!rt_task(p));
|
||||
+
|
||||
+ return p;
|
||||
+}
|
||||
+
|
||||
/* Will lock the rq it finds */
|
||||
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
|
||||
{
|
||||
@@ -1913,18 +1934,16 @@ static struct rq *find_lock_lowest_rq(st
|
||||
/*
|
||||
* We had to unlock the run queue. In
|
||||
* the mean time, task could have
|
||||
- * migrated already or had its affinity changed.
|
||||
- * Also make sure that it wasn't scheduled on its rq.
|
||||
+ * migrated already or had its affinity changed,
|
||||
+ * therefore check if the task is still at the
|
||||
+ * head of the pushable tasks list.
|
||||
* It is possible the task was scheduled, set
|
||||
* "migrate_disabled" and then got preempted, so we must
|
||||
* check the task migration disable flag here too.
|
||||
*/
|
||||
- if (unlikely(task_rq(task) != rq ||
|
||||
+ if (unlikely(is_migration_disabled(task) ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
|
||||
- task_on_cpu(rq, task) ||
|
||||
- !rt_task(task) ||
|
||||
- is_migration_disabled(task) ||
|
||||
- !task_on_rq_queued(task))) {
|
||||
+ task != pick_next_pushable_task(rq))) {
|
||||
|
||||
double_unlock_balance(rq, lowest_rq);
|
||||
lowest_rq = NULL;
|
||||
@@ -1944,27 +1963,6 @@ static struct rq *find_lock_lowest_rq(st
|
||||
return lowest_rq;
|
||||
}
|
||||
|
||||
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
-{
|
||||
- struct task_struct *p;
|
||||
-
|
||||
- if (!has_pushable_tasks(rq))
|
||||
- return NULL;
|
||||
-
|
||||
- p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
- struct task_struct, pushable_tasks);
|
||||
-
|
||||
- BUG_ON(rq->cpu != task_cpu(p));
|
||||
- BUG_ON(task_current(rq, p));
|
||||
- BUG_ON(task_current_donor(rq, p));
|
||||
- BUG_ON(p->nr_cpus_allowed <= 1);
|
||||
-
|
||||
- BUG_ON(!task_on_rq_queued(p));
|
||||
- BUG_ON(!rt_task(p));
|
||||
-
|
||||
- return p;
|
||||
-}
|
||||
-
|
||||
/*
|
||||
* If the current CPU has more than one RT task, see if the non
|
||||
* running task can migrate over to a CPU that is running a task
|
@@ -1,62 +0,0 @@
|
||||
From e02cbdc12bf63da363d7e3391376819241d67fbe Mon Sep 17 00:00:00 2001
|
||||
From: Peter Zijlstra <peterz@infradead.org>
|
||||
Date: Tue, 28 Jan 2025 15:39:49 +0100
|
||||
Subject: sched/fair: Adhere to place_entity() constraints
|
||||
|
||||
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
|
||||
placement bug causing scheduling lag") relies on commit 4423af84b297
|
||||
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
|
||||
trip a WARN in place_entity().
|
||||
|
||||
What happens is that the lag of the very last entity is 0 per
|
||||
definition -- the average of one element matches the value of that
|
||||
element. Therefore place_entity() will match the condition skipping
|
||||
the lag adjustment:
|
||||
|
||||
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
|
||||
|
||||
Without the 'se->vlag' condition -- it will attempt to adjust the zero
|
||||
lag even though we're inserting into an empty tree.
|
||||
|
||||
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
|
||||
don't because they didn't get updated.
|
||||
|
||||
Additionally, move update_load_add() after placement() as is
|
||||
consistent with other place_entity() users -- this change is
|
||||
non-functional, place_entity() does not use cfs_rq->load.
|
||||
|
||||
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Reported-by: Mike Galbraith <efault@gmx.de>
|
||||
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
|
||||
Signed-off-by: Mike Galbraith <efault@gmx.de>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de
|
||||
---
|
||||
kernel/sched/fair.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -3808,6 +3808,7 @@ static void reweight_entity(struct cfs_r
|
||||
update_entity_lag(cfs_rq, se);
|
||||
se->deadline -= se->vruntime;
|
||||
se->rel_deadline = 1;
|
||||
+ cfs_rq->nr_queued--;
|
||||
if (!curr)
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
update_load_sub(&cfs_rq->load, se->load.weight);
|
||||
@@ -3834,10 +3835,11 @@ static void reweight_entity(struct cfs_r
|
||||
|
||||
enqueue_load_avg(cfs_rq, se);
|
||||
if (se->on_rq) {
|
||||
- update_load_add(&cfs_rq->load, se->load.weight);
|
||||
place_entity(cfs_rq, se, 0);
|
||||
+ update_load_add(&cfs_rq->load, se->load.weight);
|
||||
if (!curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
+ cfs_rq->nr_queued++;
|
||||
|
||||
/*
|
||||
* The entity's vruntime has been adjusted, so let's check
|
@@ -1,184 +0,0 @@
|
||||
From 7257e4f8df6b5783978ab06063fc8529ee2631d5 Mon Sep 17 00:00:00 2001
|
||||
From: Suren Baghdasaryan <surenb@google.com>
|
||||
Date: Wed, 21 May 2025 09:06:02 -0700
|
||||
Subject: alloc_tag: handle module codetag load errors as module load failures
|
||||
|
||||
Failures inside codetag_load_module() are currently ignored. As a result
|
||||
an error there would not cause a module load failure and freeing of the
|
||||
associated resources. Correct this behavior by propagating the error code
|
||||
to the caller and handling possible errors. With this change, a failure to
|
||||
allocate percpu counters, which happens at this stage, will not be ignored
|
||||
and will cause a module load failure and freeing of resources. With this
|
||||
change we also do not need to disable memory allocation profiling when
|
||||
this error happens, instead we fail to load the module.
|
||||
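The shape of the fix is the usual "callback returns int, caller aborts and unwinds on the first failure" pattern. A minimal userspace C sketch of that pattern (the load_cb/load_all names are illustrative, not the codetag API):

  #include <stdio.h>
  #include <errno.h>

  struct tag_type {
      const char *name;
      int (*load_cb)(void);   /* returned 'void' before the fix; now reports errors */
  };

  static int alloc_counters_ok(void)  { return 0; }
  static int alloc_counters_oom(void) { return -ENOMEM; }

  /* Try every registered type; stop at the first failure so the caller can
   * fail the whole "module load" instead of silently continuing. */
  static int load_all(struct tag_type *types, int n)
  {
      for (int i = 0; i < n; i++) {
          int err = types[i].load_cb();
          if (err) {
              fprintf(stderr, "loading %s failed (%d), aborting load\n",
                      types[i].name, err);
              return err;
          }
      }
      return 0;
  }

  int main(void)
  {
      struct tag_type types[] = {
          { "alloc_tag", alloc_counters_ok  },
          { "other_tag", alloc_counters_oom },
      };
      return load_all(types, 2) ? 1 : 0;
  }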
|
||||
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
|
||||
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
|
||||
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
|
||||
Reported-by: Casey Chen <cachen@purestorage.com>
|
||||
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
|
||||
Cc: Daniel Gomez <da.gomez@samsung.com>
|
||||
Cc: David Wang <00107082@163.com>
|
||||
Cc: Kent Overstreet <kent.overstreet@linux.dev>
|
||||
Cc: Luis Chamberlain <mcgrof@kernel.org>
|
||||
Cc: Petr Pavlu <petr.pavlu@suse.com>
|
||||
Cc: Sami Tolvanen <samitolvanen@google.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/codetag.h | 8 ++++----
|
||||
kernel/module/main.c | 5 +++--
|
||||
lib/alloc_tag.c | 12 +++++++-----
|
||||
lib/codetag.c | 34 +++++++++++++++++++++++++---------
|
||||
4 files changed, 39 insertions(+), 20 deletions(-)
|
||||
|
||||
--- a/include/linux/codetag.h
|
||||
+++ b/include/linux/codetag.h
|
||||
@@ -36,8 +36,8 @@ union codetag_ref {
|
||||
struct codetag_type_desc {
|
||||
const char *section;
|
||||
size_t tag_size;
|
||||
- void (*module_load)(struct module *mod,
|
||||
- struct codetag *start, struct codetag *end);
|
||||
+ int (*module_load)(struct module *mod,
|
||||
+ struct codetag *start, struct codetag *end);
|
||||
void (*module_unload)(struct module *mod,
|
||||
struct codetag *start, struct codetag *end);
|
||||
#ifdef CONFIG_MODULES
|
||||
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struc
|
||||
unsigned long align);
|
||||
void codetag_free_module_sections(struct module *mod);
|
||||
void codetag_module_replaced(struct module *mod, struct module *new_mod);
|
||||
-void codetag_load_module(struct module *mod);
|
||||
+int codetag_load_module(struct module *mod);
|
||||
void codetag_unload_module(struct module *mod);
|
||||
|
||||
#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
|
||||
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct modu
|
||||
unsigned long align) { return NULL; }
|
||||
static inline void codetag_free_module_sections(struct module *mod) {}
|
||||
static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
|
||||
-static inline void codetag_load_module(struct module *mod) {}
|
||||
+static inline int codetag_load_module(struct module *mod) { return 0; }
|
||||
static inline void codetag_unload_module(struct module *mod) {}
|
||||
|
||||
#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
|
||||
--- a/kernel/module/main.c
|
||||
+++ b/kernel/module/main.c
|
||||
@@ -3399,11 +3399,12 @@ static int load_module(struct load_info
|
||||
goto sysfs_cleanup;
|
||||
}
|
||||
|
||||
+ if (codetag_load_module(mod))
|
||||
+ goto sysfs_cleanup;
|
||||
+
|
||||
/* Get rid of temporary copy. */
|
||||
free_copy(info, flags);
|
||||
|
||||
- codetag_load_module(mod);
|
||||
-
|
||||
/* Done! */
|
||||
trace_module_load(mod);
|
||||
|
||||
--- a/lib/alloc_tag.c
|
||||
+++ b/lib/alloc_tag.c
|
||||
@@ -618,15 +618,16 @@ out:
|
||||
mas_unlock(&mas);
|
||||
}
|
||||
|
||||
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
|
||||
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
|
||||
{
|
||||
/* Allocate module alloc_tag percpu counters */
|
||||
struct alloc_tag *start_tag;
|
||||
struct alloc_tag *stop_tag;
|
||||
struct alloc_tag *tag;
|
||||
|
||||
+ /* percpu counters for core allocations are already statically allocated */
|
||||
if (!mod)
|
||||
- return;
|
||||
+ return 0;
|
||||
|
||||
start_tag = ct_to_alloc_tag(start);
|
||||
stop_tag = ct_to_alloc_tag(stop);
|
||||
@@ -638,12 +639,13 @@ static void load_module(struct module *m
|
||||
free_percpu(tag->counters);
|
||||
tag->counters = NULL;
|
||||
}
|
||||
- shutdown_mem_profiling(true);
|
||||
- pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
|
||||
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
|
||||
mod->name);
|
||||
- break;
|
||||
+ return -ENOMEM;
|
||||
}
|
||||
}
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static void replace_module(struct module *mod, struct module *new_mod)
|
||||
--- a/lib/codetag.c
|
||||
+++ b/lib/codetag.c
|
||||
@@ -167,6 +167,7 @@ static int codetag_module_init(struct co
|
||||
{
|
||||
struct codetag_range range;
|
||||
struct codetag_module *cmod;
|
||||
+ int mod_id;
|
||||
int err;
|
||||
|
||||
range = get_section_range(mod, cttype->desc.section);
|
||||
@@ -190,11 +191,20 @@ static int codetag_module_init(struct co
|
||||
cmod->range = range;
|
||||
|
||||
down_write(&cttype->mod_lock);
|
||||
- err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
|
||||
- if (err >= 0) {
|
||||
- cttype->count += range_size(cttype, &range);
|
||||
- if (cttype->desc.module_load)
|
||||
- cttype->desc.module_load(mod, range.start, range.stop);
|
||||
+ mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
|
||||
+ if (mod_id >= 0) {
|
||||
+ if (cttype->desc.module_load) {
|
||||
+ err = cttype->desc.module_load(mod, range.start, range.stop);
|
||||
+ if (!err)
|
||||
+ cttype->count += range_size(cttype, &range);
|
||||
+ else
|
||||
+ idr_remove(&cttype->mod_idr, mod_id);
|
||||
+ } else {
|
||||
+ cttype->count += range_size(cttype, &range);
|
||||
+ err = 0;
|
||||
+ }
|
||||
+ } else {
|
||||
+ err = mod_id;
|
||||
}
|
||||
up_write(&cttype->mod_lock);
|
||||
|
||||
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct modu
|
||||
mutex_unlock(&codetag_lock);
|
||||
}
|
||||
|
||||
-void codetag_load_module(struct module *mod)
|
||||
+int codetag_load_module(struct module *mod)
|
||||
{
|
||||
struct codetag_type *cttype;
|
||||
+ int ret = 0;
|
||||
|
||||
if (!mod)
|
||||
- return;
|
||||
+ return 0;
|
||||
|
||||
mutex_lock(&codetag_lock);
|
||||
- list_for_each_entry(cttype, &codetag_types, link)
|
||||
- codetag_module_init(cttype, mod);
|
||||
+ list_for_each_entry(cttype, &codetag_types, link) {
|
||||
+ ret = codetag_module_init(cttype, mod);
|
||||
+ if (ret)
|
||||
+ break;
|
||||
+ }
|
||||
mutex_unlock(&codetag_lock);
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
void codetag_unload_module(struct module *mod)
|
@@ -1,29 +0,0 @@
|
||||
From 57fdc30dcdad60e3b868682cc1e77083c091aef5 Mon Sep 17 00:00:00 2001
|
||||
From: Chuck Lever <chuck.lever@oracle.com>
|
||||
Date: Sun, 27 Apr 2025 12:39:59 -0400
|
||||
Subject: svcrdma: Unregister the device if svc_rdma_accept() fails
|
||||
|
||||
To handle device removal, svc_rdma_accept() requests removal
|
||||
notification for the underlying device when accepting a connection.
|
||||
However svc_rdma_free() is not invoked if svc_rdma_accept() fails.
|
||||
There needs to be a matching "unregister" in that case; otherwise
|
||||
the device cannot be removed.
|
||||
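The fix follows the standard rule that every registration taken on the setup path must be undone on the error path. A small userspace C sketch of the idea (the register/unregister names are illustrative, not the rpcrdma API):

  #include <stdio.h>
  #include <stdbool.h>

  static bool notifier_registered;

  static void register_removal_notifier(void)   { notifier_registered = true; }
  static void unregister_removal_notifier(void) { notifier_registered = false; }

  /* Accept a connection; if a later setup step fails, undo the registration
   * too, otherwise the device can never be removed. */
  static int accept_connection(bool qp_creation_fails)
  {
      register_removal_notifier();

      if (qp_creation_fails)
          goto err;

      return 0;

  err:
      unregister_removal_notifier();   /* the matching "unregister" */
      return -1;
  }

  int main(void)
  {
      accept_connection(true);
      printf("notifier still registered after failed accept: %s\n",
             notifier_registered ? "yes (leak)" : "no (clean)");
      return 0;
  }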
|
||||
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler")
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
|
||||
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
|
||||
@@ -577,6 +577,7 @@ static struct svc_xprt *svc_rdma_accept(
|
||||
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
|
||||
ib_destroy_qp(newxprt->sc_qp);
|
||||
rdma_destroy_id(newxprt->sc_cm_id);
|
||||
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
|
||||
/* This call to put will destroy the transport */
|
||||
svc_xprt_put(&newxprt->sc_xprt);
|
||||
return NULL;
|
@@ -1,53 +0,0 @@
|
||||
From 92e99ba55ff0ce68ea7567331beda21861da2028 Mon Sep 17 00:00:00 2001
|
||||
From: Chuck Lever <chuck.lever@oracle.com>
|
||||
Date: Wed, 21 May 2025 16:34:13 -0400
|
||||
Subject: SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls
|
||||
|
||||
Engineers at Hammerspace noticed that sometimes mounting with
|
||||
"xprtsec=tls" hangs for a minute or so, and then times out, even
|
||||
when the NFS server is reachable and responsive.
|
||||
|
||||
kTLS shuts off data_ready callbacks if strp->msg_ready is set to
|
||||
mitigate data_ready callbacks when a full TLS record is not yet
|
||||
ready to be read from the socket.
|
||||
|
||||
Normally msg_ready is clear when the first TLS record arrives on
|
||||
a socket. However, I observed that sometimes tls_setsockopt() sets
|
||||
strp->msg_ready, and that prevents forward progress because
|
||||
tls_data_ready() becomes a no-op.
|
||||
|
||||
Moreover, Jakub says: "If there's a full record queued at the time
|
||||
when [tlshd] passes the socket back to the kernel, it's up to the
|
||||
reader to read the already queued data out." So SunRPC cannot
|
||||
expect a data_ready call when ingress data is already waiting.
|
||||
|
||||
Add an explicit poll after SunRPC's upper transport is set up to
|
||||
pick up any data that arrived after the TLS handshake but before
|
||||
transport set-up is complete.
|
||||
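In other words, once a "data arrived" callback only fires for new records, anything queued before the callback was installed has to be drained explicitly. A minimal userspace C sketch of that pattern (a simulated queue, not the kTLS/SunRPC code):

  #include <stdio.h>

  static int queued_records;            /* records already waiting on the socket */
  static void (*data_ready)(void);      /* consumer callback, NULL until set up */

  static void consume_ready(void)
  {
      printf("consuming %d record(s)\n", queued_records);
      queued_records = 0;
  }

  static void record_arrives(void)
  {
      queued_records++;
      if (data_ready)
          data_ready();                 /* only new arrivals trigger the callback */
  }

  int main(void)
  {
      record_arrives();                 /* arrives during the handshake window */

      data_ready = consume_ready;       /* transport set-up completes */

      /* Without an explicit check the early record is never consumed:
       * no further callback will fire for it. */
      if (queued_records)
          data_ready();                 /* the explicit "poll" after set-up */
      return 0;
  }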
|
||||
Reported-by: Steve Sears <sjs@hammerspace.com>
|
||||
Suggested-by: Jakub Kicinski <kuba@kernel.org>
|
||||
Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class")
|
||||
Tested-by: Mike Snitzer <snitzer@kernel.org>
|
||||
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
|
||||
---
|
||||
net/sunrpc/xprtsock.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/net/sunrpc/xprtsock.c
|
||||
+++ b/net/sunrpc/xprtsock.c
|
||||
@@ -2740,6 +2740,11 @@ static void xs_tcp_tls_setup_socket(stru
|
||||
}
|
||||
rpc_shutdown_client(lower_clnt);
|
||||
|
||||
+ /* Check for ingress data that arrived before the socket's
|
||||
+ * ->data_ready callback was set up.
|
||||
+ */
|
||||
+ xs_poll_check_readable(upper_transport);
|
||||
+
|
||||
out_unlock:
|
||||
current_restore_flags(pflags, PF_MEMALLOC);
|
||||
upper_transport->clnt = NULL;
|
@@ -1,89 +0,0 @@
|
||||
From ac0c5ac5efecec7f731a1d80ec40ef3d34adc5ee Mon Sep 17 00:00:00 2001
|
||||
From: Saurabh Sengar <ssengar@linux.microsoft.com>
|
||||
Date: Thu, 29 May 2025 03:18:30 -0700
|
||||
Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp()
|
||||
|
||||
The MANA driver's probe registers netdevice via the following call chain:
|
||||
|
||||
mana_probe()
|
||||
register_netdev()
|
||||
register_netdevice()
|
||||
|
||||
register_netdevice() calls notifier callback for netvsc driver,
|
||||
holding the netdev mutex via netdev_lock_ops().
|
||||
|
||||
Further, this netvsc notifier callback ends up attempting to acquire the
|
||||
same lock again in dev_xdp_propagate() leading to deadlock.
|
||||
|
||||
netvsc_netdev_event()
|
||||
netvsc_vf_setxdp()
|
||||
dev_xdp_propagate()
|
||||
|
||||
This deadlock was not observed so far because net_shaper_ops was never set,
|
||||
and thus the lock was effectively a no-op in this case. Fix this by using
|
||||
netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive
|
||||
locking in this path.
|
||||
|
||||
And, since no deadlock is observed on the other path which is via
|
||||
netvsc_probe, add the lock exclusively for that path.
|
||||
|
||||
Also, clean up the unregistration path by removing the unnecessary call to
|
||||
netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already
|
||||
performs this cleanup via dev_xdp_uninstall().
|
||||
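The pattern behind the fix is the common locked/unlocked split: a path that already holds the instance lock must call the variant that assumes the lock is held, not the wrapper that takes it again. A minimal userspace C sketch, with a pthread mutex standing in for the netdev instance lock (illustrative names, not the netdev API):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t instance_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Variant that assumes the caller already holds instance_lock
   * (analogous to netif_xdp_propagate()). */
  static void propagate_locked(void) { printf("propagated under caller's lock\n"); }

  /* Wrapper that takes the lock itself (analogous to dev_xdp_propagate()).
   * Calling it while the lock is already held would self-deadlock on a
   * non-recursive mutex. */
  static void propagate(void)
  {
      pthread_mutex_lock(&instance_lock);
      propagate_locked();
      pthread_mutex_unlock(&instance_lock);
  }

  /* Notifier callback: runs with instance_lock already held, so it must
   * use the _locked variant. */
  static void notifier_event(void) { propagate_locked(); }

  int main(void)
  {
      propagate();                         /* normal path: wrapper takes the lock */

      pthread_mutex_lock(&instance_lock);  /* registration path holds the lock... */
      notifier_event();                    /* ...and calls into the notifier */
      pthread_mutex_unlock(&instance_lock);
      return 0;
  }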
|
||||
Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
|
||||
Tested-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
|
||||
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
|
||||
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
|
||||
Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
drivers/net/hyperv/netvsc_bpf.c | 2 +-
|
||||
drivers/net/hyperv/netvsc_drv.c | 4 ++--
|
||||
net/core/dev.c | 1 +
|
||||
3 files changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/drivers/net/hyperv/netvsc_bpf.c
|
||||
+++ b/drivers/net/hyperv/netvsc_bpf.c
|
||||
@@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *
|
||||
xdp.command = XDP_SETUP_PROG;
|
||||
xdp.prog = prog;
|
||||
|
||||
- ret = dev_xdp_propagate(vf_netdev, &xdp);
|
||||
+ ret = netif_xdp_propagate(vf_netdev, &xdp);
|
||||
|
||||
if (ret && prog)
|
||||
bpf_prog_put(prog);
|
||||
--- a/drivers/net/hyperv/netvsc_drv.c
|
||||
+++ b/drivers/net/hyperv/netvsc_drv.c
|
||||
@@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct n
|
||||
|
||||
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
|
||||
|
||||
- netvsc_vf_setxdp(vf_netdev, NULL);
|
||||
-
|
||||
reinit_completion(&net_device_ctx->vf_add);
|
||||
netdev_rx_handler_unregister(vf_netdev);
|
||||
netdev_upper_dev_unlink(vf_netdev, ndev);
|
||||
@@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device
|
||||
continue;
|
||||
|
||||
netvsc_prepare_bonding(vf_netdev);
|
||||
+ netdev_lock_ops(vf_netdev);
|
||||
netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
|
||||
+ netdev_unlock_ops(vf_netdev);
|
||||
__netvsc_vf_setup(net, vf_netdev);
|
||||
break;
|
||||
}
|
||||
--- a/net/core/dev.c
|
||||
+++ b/net/core/dev.c
|
||||
@@ -9863,6 +9863,7 @@ int netif_xdp_propagate(struct net_devic
|
||||
|
||||
return dev->netdev_ops->ndo_bpf(dev, bpf);
|
||||
}
|
||||
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
|
||||
|
||||
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
|
||||
{
|
@@ -1,113 +0,0 @@
|
||||
From 485c82a86fb97fb86cac303348c85b6cf71fd787 Mon Sep 17 00:00:00 2001
|
||||
From: Jakub Kicinski <kuba@kernel.org>
|
||||
Date: Mon, 9 Jun 2025 17:12:44 -0700
|
||||
Subject: net: clear the dst when changing skb protocol
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
A not-so-careful NAT46 BPF program can crash the kernel
|
||||
if it indiscriminately flips ingress packets from v4 to v6:
|
||||
|
||||
BUG: kernel NULL pointer dereference, address: 0000000000000000
|
||||
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
|
||||
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
|
||||
process_backlog (net/core/dev.c:6186:4)
|
||||
napi_poll (net/core/dev.c:6906:9)
|
||||
net_rx_action (net/core/dev.c:7028:13)
|
||||
do_softirq (kernel/softirq.c:462:3)
|
||||
netif_rx (net/core/dev.c:5326:3)
|
||||
dev_loopback_xmit (net/core/dev.c:4015:2)
|
||||
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
|
||||
NF_HOOK (./include/linux/netfilter.h:314:9)
|
||||
ip_mc_output (net/ipv4/ip_output.c:400:5)
|
||||
dst_output (./include/net/dst.h:459:9)
|
||||
ip_local_out (net/ipv4/ip_output.c:130:9)
|
||||
ip_send_skb (net/ipv4/ip_output.c:1496:8)
|
||||
udp_send_skb (net/ipv4/udp.c:1040:8)
|
||||
udp_sendmsg (net/ipv4/udp.c:1328:10)
|
||||
|
||||
The output interface has a 4->6 program attached at ingress.
|
||||
We try to loop the multicast skb back to the sending socket.
|
||||
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
|
||||
and changes skb->protocol to v6. We enter ip6_rcv_core which
|
||||
tries to use skb_dst(). But the dst is still an IPv4 one left
|
||||
after IPv4 mcast output.
|
||||
|
||||
Clear the dst in all BPF helpers which change the protocol.
|
||||
Try to preserve metadata dsts, those may carry non-routing
|
||||
metadata.
|
||||
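The underlying rule: when a helper rewrites the field that cached state was derived from, it must drop that cache (a v4 route cached in the skb is meaningless once skb->protocol says v6), while metadata-only entries may be kept. A small userspace C sketch of that invalidation rule (illustrative types, not the skb/dst API):

  #include <stdio.h>
  #include <stdbool.h>

  struct route { bool is_metadata; const char *desc; };

  struct packet {
      unsigned short proto;        /* 0x0800 = IPv4, 0x86DD = IPv6 */
      struct route *cached_route;  /* derived from proto; stale if proto changes */
  };

  /* Change the protocol and drop the routing cache derived from the old
   * protocol, but keep metadata-only entries. */
  static void change_protocol(struct packet *p, unsigned short proto)
  {
      p->proto = proto;
      if (p->cached_route && !p->cached_route->is_metadata)
          p->cached_route = NULL;
  }

  int main(void)
  {
      struct route v4_route = { .is_metadata = false, .desc = "v4 route" };
      struct packet pkt = { .proto = 0x0800, .cached_route = &v4_route };

      change_protocol(&pkt, 0x86DD);   /* 4->6 translation */
      printf("cached route after flip: %s\n",
             pkt.cached_route ? pkt.cached_route->desc : "(dropped)");
      return 0;
  }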
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Maciej Żenczykowski <maze@google.com>
|
||||
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
|
||||
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
|
||||
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
|
||||
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
|
||||
Reviewed-by: Willem de Bruijn <willemb@google.com>
|
||||
Link: https://patch.msgid.link/20250610001245.1981782-1-kuba@kernel.org
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
net/core/filter.c | 19 +++++++++++++------
|
||||
1 file changed, 13 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/net/core/filter.c
|
||||
+++ b/net/core/filter.c
|
||||
@@ -3233,6 +3233,13 @@ static const struct bpf_func_proto bpf_s
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
};
|
||||
|
||||
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
|
||||
+{
|
||||
+ skb->protocol = htons(proto);
|
||||
+ if (skb_valid_dst(skb))
|
||||
+ skb_dst_drop(skb);
|
||||
+}
|
||||
+
|
||||
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
|
||||
{
|
||||
/* Caller already did skb_cow() with len as headroom,
|
||||
@@ -3329,7 +3336,7 @@ static int bpf_skb_proto_4_to_6(struct s
|
||||
}
|
||||
}
|
||||
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
skb_clear_hash(skb);
|
||||
|
||||
return 0;
|
||||
@@ -3359,7 +3366,7 @@ static int bpf_skb_proto_6_to_4(struct s
|
||||
}
|
||||
}
|
||||
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
skb_clear_hash(skb);
|
||||
|
||||
return 0;
|
||||
@@ -3550,10 +3557,10 @@ static int bpf_skb_net_grow(struct sk_bu
|
||||
/* Match skb->protocol to new outer l3 protocol */
|
||||
if (skb->protocol == htons(ETH_P_IP) &&
|
||||
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
else if (skb->protocol == htons(ETH_P_IPV6) &&
|
||||
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
}
|
||||
|
||||
if (skb_is_gso(skb)) {
|
||||
@@ -3606,10 +3613,10 @@ static int bpf_skb_net_shrink(struct sk_
|
||||
/* Match skb->protocol to new outer l3 protocol */
|
||||
if (skb->protocol == htons(ETH_P_IP) &&
|
||||
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
else if (skb->protocol == htons(ETH_P_IPV6) &&
|
||||
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
|
||||
if (skb_is_gso(skb)) {
|
||||
struct skb_shared_info *shinfo = skb_shinfo(skb);
|
@@ -1,67 +0,0 @@
|
||||
From 2bf1f4a3adcecc53c1012e460d1412cece3747ce Mon Sep 17 00:00:00 2001
|
||||
From: Eric Dumazet <edumazet@google.com>
|
||||
Date: Wed, 11 Jun 2025 08:35:01 +0000
|
||||
Subject: net_sched: sch_sfq: reject invalid perturb period
|
||||
|
||||
Gerrard Tai reported that SFQ perturb_period has no range check yet,
|
||||
and this can be used to trigger a race condition fixed in a separate patch.
|
||||
|
||||
We want to make sure ctl->perturb_period * HZ will not overflow
|
||||
and is positive.
|
||||
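The check is the usual pre-multiplication bound: validate the user value against INT_MAX / HZ before multiplying, so the product provably fits. A tiny userspace C sketch of that validation (HZ hard-coded for illustration; not the qdisc code):

  #include <stdio.h>
  #include <limits.h>

  #define HZ 1000   /* illustrative tick rate */

  /* Reject values whose conversion to jiffies would be negative or overflow. */
  static int check_perturb_period(int period_secs, int *jiffies_out)
  {
      if (period_secs < 0 || period_secs > INT_MAX / HZ)
          return -1;                     /* "invalid perturb period" */
      *jiffies_out = period_secs * HZ;   /* now guaranteed not to overflow */
      return 0;
  }

  int main(void)
  {
      int j;
      printf("-10        -> %d\n", check_perturb_period(-10, &j));
      printf("1000000000 -> %d\n", check_perturb_period(1000000000, &j));
      if (check_perturb_period(2000000, &j) == 0)
          printf("2000000    -> ok, %d jiffies\n", j);
      return 0;
  }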
|
||||
Tested:
|
||||
|
||||
tc qd add dev lo root sfq perturb -10 # negative value : error
|
||||
Error: sch_sfq: invalid perturb period.
|
||||
|
||||
tc qd add dev lo root sfq perturb 1000000000 # too big : error
|
||||
Error: sch_sfq: invalid perturb period.
|
||||
|
||||
tc qd add dev lo root sfq perturb 2000000 # acceptable value
|
||||
tc -s -d qd sh dev lo
|
||||
qdisc sfq 8005: root refcnt 2 limit 127p quantum 64Kb depth 127 flows 128 divisor 1024 perturb 2000000sec
|
||||
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
|
||||
backlog 0b 0p requeues 0
|
||||
|
||||
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
|
||||
Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
|
||||
Signed-off-by: Eric Dumazet <edumazet@google.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://patch.msgid.link/20250611083501.1810459-1-edumazet@google.com
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
net/sched/sch_sfq.c | 10 ++++++++--
|
||||
1 file changed, 8 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/net/sched/sch_sfq.c
|
||||
+++ b/net/sched/sch_sfq.c
|
||||
@@ -656,6 +656,14 @@ static int sfq_change(struct Qdisc *sch,
|
||||
NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
|
||||
return -EINVAL;
|
||||
}
|
||||
+
|
||||
+ if (ctl->perturb_period < 0 ||
|
||||
+ ctl->perturb_period > INT_MAX / HZ) {
|
||||
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+ perturb_period = ctl->perturb_period * HZ;
|
||||
+
|
||||
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
|
||||
ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
|
||||
return -EINVAL;
|
||||
@@ -672,14 +680,12 @@ static int sfq_change(struct Qdisc *sch,
|
||||
headdrop = q->headdrop;
|
||||
maxdepth = q->maxdepth;
|
||||
maxflows = q->maxflows;
|
||||
- perturb_period = q->perturb_period;
|
||||
quantum = q->quantum;
|
||||
flags = q->flags;
|
||||
|
||||
/* update and validate configuration */
|
||||
if (ctl->quantum)
|
||||
quantum = ctl->quantum;
|
||||
- perturb_period = ctl->perturb_period * HZ;
|
||||
if (ctl->flows)
|
||||
maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
|
||||
if (ctl->divisor) {
|
@@ -1,93 +0,0 @@
|
||||
From 90a5248443f925040b46e32fcf6715615c73e396 Mon Sep 17 00:00:00 2001
|
||||
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Date: Fri, 6 Jun 2025 13:50:32 +0100
|
||||
Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
|
||||
|
||||
While an OOM failure in commit_merge() isn't really feasible due to the
|
||||
allocation which might fail (a maple tree pre-allocation) being 'too small
|
||||
to fail', we do need to handle this case correctly regardless.
|
||||
|
||||
In vma_merge_existing_range(), we can theoretically encounter failures
|
||||
which result in an OOM error in two ways - firstly dup_anon_vma() might
|
||||
fail with an OOM error, and secondly commit_merge() might fail, ultimately,
|
||||
to pre-allocate a maple tree node.
|
||||
|
||||
The abort logic for dup_anon_vma() resets the VMA iterator to the initial
|
||||
range, ensuring that any logic looping on this iterator will correctly
|
||||
proceed to the next VMA.
|
||||
|
||||
However the commit_merge() abort logic does not do the same thing. This
|
||||
resulted in a syzbot report occurring because mlockall() iterates through
|
||||
VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
|
||||
being specified due to incorrect iterator state.
|
||||
|
||||
While making this change, it became apparent we are duplicating logic -
|
||||
the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
|
||||
option on modify/merge, use in uffd release") duplicates the
|
||||
vmg->give_up_on_oom check in both abort branches.
|
||||
|
||||
Additionally, we observe that we can perform the anon_dup check safely on
|
||||
dup_anon_vma() failure, as this will not be modified should this call
|
||||
fail.
|
||||
|
||||
Finally, we need to reset the iterator in both cases, so now we can simply
|
||||
use the exact same code to abort for both.
|
||||
|
||||
We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
|
||||
be otherwise and it allows us to implement the abort check more neatly.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
|
||||
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
|
||||
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
|
||||
Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
|
||||
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
|
||||
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
|
||||
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
|
||||
Cc: Jann Horn <jannh@google.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vma.c | 22 ++++------------------
|
||||
1 file changed, 4 insertions(+), 18 deletions(-)
|
||||
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -927,26 +927,9 @@ static __must_check struct vm_area_struc
|
||||
err = dup_anon_vma(next, middle, &anon_dup);
|
||||
}
|
||||
|
||||
- if (err)
|
||||
+ if (err || commit_merge(vmg))
|
||||
goto abort;
|
||||
|
||||
- err = commit_merge(vmg);
|
||||
- if (err) {
|
||||
- VM_WARN_ON(err != -ENOMEM);
|
||||
-
|
||||
- if (anon_dup)
|
||||
- unlink_anon_vmas(anon_dup);
|
||||
-
|
||||
- /*
|
||||
- * We've cleaned up any cloned anon_vma's, no VMAs have been
|
||||
- * modified, no harm no foul if the user requests that we not
|
||||
- * report this and just give up, leaving the VMAs unmerged.
|
||||
- */
|
||||
- if (!vmg->give_up_on_oom)
|
||||
- vmg->state = VMA_MERGE_ERROR_NOMEM;
|
||||
- return NULL;
|
||||
- }
|
||||
-
|
||||
khugepaged_enter_vma(vmg->target, vmg->flags);
|
||||
vmg->state = VMA_MERGE_SUCCESS;
|
||||
return vmg->target;
|
||||
@@ -955,6 +938,9 @@ abort:
|
||||
vma_iter_set(vmg->vmi, start);
|
||||
vma_iter_load(vmg->vmi);
|
||||
|
||||
+ if (anon_dup)
|
||||
+ unlink_anon_vmas(anon_dup);
|
||||
+
|
||||
/*
|
||||
* This means we have failed to clone anon_vma's correctly, but no
|
||||
* actual changes to VMAs have occurred, so no harm no foul - if the
|
@@ -1,90 +0,0 @@
|
||||
From 7c9d5350d8acfe1b876a8acabdf247b44a803d58 Mon Sep 17 00:00:00 2001
|
||||
From: Ryan Roberts <ryan.roberts@arm.com>
|
||||
Date: Fri, 6 Jun 2025 10:28:07 +0100
|
||||
Subject: mm: close theoretical race where stale TLB entries could linger
|
||||
|
||||
Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a
|
||||
parallel reclaim leaving stale TLB entries") described a theoretical race
|
||||
as such:
|
||||
|
||||
|
||||
"""
|
||||
Nadav Amit identified a theoretical race between page reclaim and mprotect
|
||||
due to TLB flushes being batched outside of the PTL being held.
|
||||
|
||||
He described the race as follows:
|
||||
|
||||
CPU0 CPU1
|
||||
---- ----
|
||||
user accesses memory using RW PTE
|
||||
[PTE now cached in TLB]
|
||||
try_to_unmap_one()
|
||||
==> ptep_get_and_clear()
|
||||
==> set_tlb_ubc_flush_pending()
|
||||
mprotect(addr, PROT_READ)
|
||||
==> change_pte_range()
|
||||
==> [ PTE non-present - no flush ]
|
||||
|
||||
user writes using cached RW PTE
|
||||
...
|
||||
|
||||
try_to_unmap_flush()
|
||||
|
||||
The same type of race exists for reads when protecting for PROT_NONE and
|
||||
also exists for operations that can leave an old TLB entry behind such as
|
||||
munmap, mremap and madvise.
|
||||
"""
|
||||
|
||||
The solution was to introduce flush_tlb_batched_pending() and call it
|
||||
under the PTL from mprotect/madvise/munmap/mremap to complete any pending
|
||||
tlb flushes.
|
||||
|
||||
However, while madvise_free_pte_range() and
|
||||
madvise_cold_or_pageout_pte_range() were both retro-fitted to call
|
||||
flush_tlb_batched_pending() immediately after initially acquiring the PTL,
|
||||
they both temporarily release the PTL to split a large folio if they
|
||||
stumble upon one. In this case, when re-acquiring the PTL,
|
||||
flush_tlb_batched_pending() must be called again, but it previously was
|
||||
not. Let's fix that.
|
||||
|
||||
There are 2 Fixes: tags here: the first is the commit that fixed
|
||||
madvise_free_pte_range(). The second is the commit that added
|
||||
madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the
|
||||
faulty pattern from madvise_free_pte_range().
|
||||
|
||||
This is a theoretical bug discovered during code review.
|
||||
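The rule being restored is simple: every time the walk drops and re-takes the lock, the pending-batched-flush check has to run again before the walk trusts the page tables. A minimal userspace C sketch of that lock/recheck shape (a flag stands in for the batched-flush state; not the mm code):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
  static bool flush_pending;   /* set by a concurrent reclaimer, cleared by a flush */

  static void flush_batched_pending_sketch(void)
  {
      if (flush_pending) {
          printf("flushing stale entries\n");
          flush_pending = false;
      }
  }

  static void walk_range(bool must_split_mid_walk)
  {
      pthread_mutex_lock(&ptl);
      flush_batched_pending_sketch();      /* check on first acquisition */

      if (must_split_mid_walk) {
          pthread_mutex_unlock(&ptl);      /* drop the lock to split a large folio */
          flush_pending = true;            /* reclaim may batch a flush meanwhile */
          pthread_mutex_lock(&ptl);
          flush_batched_pending_sketch();  /* must check again after re-taking */
      }

      /* ... continue the walk with no stale-TLB window ... */
      pthread_mutex_unlock(&ptl);
  }

  int main(void) { walk_range(true); return 0; }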
|
||||
Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com
|
||||
Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries")
|
||||
Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD")
|
||||
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
|
||||
Reviewed-by: Jann Horn <jannh@google.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/madvise.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
--- a/mm/madvise.c
|
||||
+++ b/mm/madvise.c
|
||||
@@ -503,6 +503,7 @@ restart:
|
||||
pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
if (!start_pte)
|
||||
break;
|
||||
+ flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
if (!err)
|
||||
nr = 0;
|
||||
@@ -736,6 +737,7 @@ static int madvise_free_pte_range(pmd_t
|
||||
start_pte = pte;
|
||||
if (!start_pte)
|
||||
break;
|
||||
+ flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
if (!err)
|
||||
nr = 0;
|
@@ -1,33 +0,0 @@
|
||||
From 862a81c79f0bea8ede0352b637b44716f02f71b9 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Fri, 13 Jun 2025 11:01:49 -0600
|
||||
Subject: io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
|
||||
|
||||
If peeking a bunch of buffers, normally io_ring_buffers_peek() will
|
||||
truncate the end buffer. This isn't optimal as presumably more data will
|
||||
be arriving later, and hence it's better to stop with the last full
|
||||
buffer rather than truncate the end buffer.
|
||||
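The selection rule after the change: keep taking whole buffers while they fit; once the next buffer would have to be cut short, truncate only if it is the very first pick, otherwise stop at the previous full buffer. A small userspace C sketch of that loop (plain arrays instead of the io_uring ring types):

  #include <stdio.h>

  /* Returns how many buffers were picked; *used is the total length taken. */
  static int peek_buffers(const int *buf_len, int nr_bufs, int max_len, int *used)
  {
      int picked = 0, total = 0;

      for (int i = 0; i < nr_bufs && total < max_len; i++) {
          int len = buf_len[i];

          if (total + len > max_len) {
              /* A partial buffer is only acceptable as the first pick;
               * otherwise stop with the last full buffer. */
              if (picked > 0)
                  break;
              len = max_len - total;
          }
          total += len;
          picked++;
      }
      *used = total;
      return picked;
  }

  int main(void)
  {
      int bufs[] = { 4096, 4096, 4096 };
      int used;
      int n = peek_buffers(bufs, 3, 10000, &used);
      printf("picked %d buffer(s), %d bytes (last buffer left untouched)\n", n, used);
      return 0;
  }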
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers")
|
||||
Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
io_uring/kbuf.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/io_uring/kbuf.c
|
||||
+++ b/io_uring/kbuf.c
|
||||
@@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct i
|
||||
/* truncate end piece, if needed, for non partial buffers */
|
||||
if (len > arg->max_len) {
|
||||
len = arg->max_len;
|
||||
- if (!(bl->flags & IOBL_INC))
|
||||
+ if (!(bl->flags & IOBL_INC)) {
|
||||
+ if (iov != arg->iovs)
|
||||
+ break;
|
||||
buf->len = len;
|
||||
+ }
|
||||
}
|
||||
|
||||
iov->iov_base = u64_to_user_ptr(buf->addr);
|
@@ -1,54 +0,0 @@
|
||||
From bb3d761325a1707c8064a3d7dd556ed6a501a2e7 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Fri, 13 Jun 2025 13:37:41 -0600
|
||||
Subject: nvme: always punt polled uring_cmd end_io work to task_work
|
||||
|
||||
Currently NVMe uring_cmd completions will complete locally, if they are
|
||||
polled. This is done because those completions are always invoked from
|
||||
task context. And while that is true, there's no guarantee that it's
|
||||
invoked under the right ring context, or even task. If someone does
|
||||
NVMe passthrough via multiple threads and with a limited number of
|
||||
poll queues, then ringA may find completions from ringB. For that case,
|
||||
completing the request may not be sound.
|
||||
|
||||
Always just punt the passthrough completions via task_work, which will
|
||||
redirect the completion, if needed.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands")
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
drivers/nvme/host/ioctl.c | 21 +++++++--------------
|
||||
1 file changed, 7 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/drivers/nvme/host/ioctl.c
|
||||
+++ b/drivers/nvme/host/ioctl.c
|
||||
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd
|
||||
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
/*
|
||||
- * For iopoll, complete it directly. Note that using the uring_cmd
|
||||
- * helper for this is safe only because we check blk_rq_is_poll().
|
||||
- * As that returns false if we're NOT on a polled queue, then it's
|
||||
- * safe to use the polled completion helper.
|
||||
- *
|
||||
- * Otherwise, move the completion to task work.
|
||||
+ * IOPOLL could potentially complete this request directly, but
|
||||
+ * if multiple rings are polling on the same queue, then it's possible
|
||||
+ * for one ring to find completions for another ring. Punting the
|
||||
+ * completion via task_work will always direct it to the right
|
||||
+ * location, rather than potentially complete requests for ringA
|
||||
+ * under iopoll invocations from ringB.
|
||||
*/
|
||||
- if (blk_rq_is_poll(req)) {
|
||||
- if (pdu->bio)
|
||||
- blk_rq_unmap_user(pdu->bio);
|
||||
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
|
||||
- } else {
|
||||
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
- }
|
||||
-
|
||||
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
return RQ_END_IO_FREE;
|
||||
}
|
||||
|
@@ -1,33 +0,0 @@
|
||||
From a57621608b2cbcbd0c7da184e9012b9b111a8577 Mon Sep 17 00:00:00 2001
|
||||
From: Damien Le Moal <dlemoal@kernel.org>
|
||||
Date: Wed, 11 Jun 2025 09:59:15 +0900
|
||||
Subject: block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
|
||||
|
||||
When blk_zone_write_plug_bio_endio() is called for a regular write BIO
|
||||
used to emulate a zone append operation, that is, a BIO flagged with
|
||||
BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the
|
||||
original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not
|
||||
cleared. Clear it to fully return the BIO to its original definition.
|
||||
|
||||
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
|
||||
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Reviewed-by: Hannes Reinecke <hare@suse.de>
|
||||
Reviewed-by: Christoph Hellwig <hch@lst.de>
|
||||
Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
block/blk-zoned.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/block/blk-zoned.c
|
||||
+++ b/block/blk-zoned.c
|
||||
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struc
|
||||
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
|
||||
bio->bi_opf &= ~REQ_OP_MASK;
|
||||
bio->bi_opf |= REQ_OP_ZONE_APPEND;
|
||||
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
|
||||
}
|
||||
|
||||
/*
|
@@ -1,65 +0,0 @@
|
||||
From 7fc5a2cbcc8459cab6ae8c5dd1220768027ccb70 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Wed, 11 Jun 2025 08:48:46 -0600
|
||||
Subject: block: use plug request list tail for one-shot backmerge attempt
|
||||
|
||||
Previously, the block layer stored the requests in the plug list in
|
||||
LIFO order. For this reason, blk_attempt_plug_merge() would check
|
||||
just the head entry for a back merge attempt, and abort after that
|
||||
unless requests for multiple queues existed in the plug list. If more
|
||||
than one request is present in the plug list, this makes the one-shot
|
||||
back merging less useful than before, as it'll always fail to find a
|
||||
quick merge candidate.
|
||||
|
||||
Use the tail entry for the one-shot merge attempt, which is the last
|
||||
added request in the list. If that fails, abort immediately unless
|
||||
there are multiple queues available. If multiple queues are available,
|
||||
then scan the list. Ideally the latter scan would be a backwards scan
|
||||
of the list, but as it currently stands, the plug list is singly linked
|
||||
and hence this isn't easily feasible.
|
||||
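A sketch of the new lookup order: consult the tail (most recently added request) for the one-shot attempt, and only fall back to a full scan when the plug holds requests for multiple queues. Minimal userspace C with a singly linked list plus tail pointer (illustrative types, not the block-layer structs):

  #include <stdio.h>
  #include <stdbool.h>

  struct rq { int q; struct rq *next; };
  struct plug { struct rq *head, *tail; bool multiple_queues; };

  static bool try_merge(struct rq *rq, int q) { return rq->q == q; /* stand-in for a bio merge */ }

  static bool attempt_plug_merge(struct plug *plug, int q)
  {
      if (!plug->head)
          return false;

      /* One-shot attempt against the most recently added request. */
      if (plug->tail->q == q)
          return try_merge(plug->tail, q);
      if (!plug->multiple_queues)
          return false;

      /* Otherwise scan, stopping at the first request for this queue. */
      for (struct rq *rq = plug->head; rq; rq = rq->next) {
          if (rq->q != q)
              continue;
          return try_merge(rq, q);
      }
      return false;
  }

  int main(void)
  {
      struct rq a = { .q = 1 }, b = { .q = 1, .next = NULL };
      struct plug plug = { .head = &a, .tail = &b, .multiple_queues = false };
      a.next = &b;
      printf("merged with tail request: %s\n",
             attempt_plug_merge(&plug, 1) ? "yes" : "no");
      return 0;
  }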
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/
|
||||
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
|
||||
Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug")
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
block/blk-merge.c | 26 +++++++++++++-------------
|
||||
1 file changed, 13 insertions(+), 13 deletions(-)
|
||||
|
||||
--- a/block/blk-merge.c
|
||||
+++ b/block/blk-merge.c
|
||||
@@ -1127,20 +1127,20 @@ bool blk_attempt_plug_merge(struct reque
|
||||
if (!plug || rq_list_empty(&plug->mq_list))
|
||||
return false;
|
||||
|
||||
- rq_list_for_each(&plug->mq_list, rq) {
|
||||
- if (rq->q == q) {
|
||||
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
- BIO_MERGE_OK)
|
||||
- return true;
|
||||
- break;
|
||||
- }
|
||||
+ rq = plug->mq_list.tail;
|
||||
+ if (rq->q == q)
|
||||
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
+ BIO_MERGE_OK;
|
||||
+ else if (!plug->multiple_queues)
|
||||
+ return false;
|
||||
|
||||
- /*
|
||||
- * Only keep iterating plug list for merges if we have multiple
|
||||
- * queues
|
||||
- */
|
||||
- if (!plug->multiple_queues)
|
||||
- break;
|
||||
+ rq_list_for_each(&plug->mq_list, rq) {
|
||||
+ if (rq->q != q)
|
||||
+ continue;
|
||||
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
+ BIO_MERGE_OK)
|
||||
+ return true;
|
||||
+ break;
|
||||
}
|
||||
return false;
|
||||
}
|
@@ -1,149 +0,0 @@
|
||||
From 8ad4520fc849262ab23adbabebd366d4755035bc Mon Sep 17 00:00:00 2001
|
||||
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
|
||||
Date: Tue, 3 Jun 2025 14:14:45 +0300
|
||||
Subject: Revert "mm/execmem: Unify early execmem_cache behaviour"
|
||||
|
||||
The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache
|
||||
behaviour") changed early behaviour of execemem ROX cache to allow its
|
||||
usage in early x86 code that allocates text pages when
|
||||
CONFIG_MITIGATION_ITS is enabled.
|
||||
|
||||
The permission management of the pages allocated from execmem for ITS
|
||||
mitigation is now completely contained in arch/x86/kernel/alternative.c
|
||||
and therefore there is no need to special case early allocations in
|
||||
execmem.
|
||||
|
||||
This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414.
|
||||
|
||||
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org
|
||||
---
|
||||
arch/x86/mm/init_32.c | 3 ---
|
||||
arch/x86/mm/init_64.c | 3 ---
|
||||
include/linux/execmem.h | 8 +-------
|
||||
mm/execmem.c | 40 +++-------------------------------------
|
||||
4 files changed, 4 insertions(+), 50 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/init_32.c
|
||||
+++ b/arch/x86/mm/init_32.c
|
||||
@@ -30,7 +30,6 @@
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/gfp.h>
|
||||
-#include <linux/execmem.h>
|
||||
|
||||
#include <asm/asm.h>
|
||||
#include <asm/bios_ebda.h>
|
||||
@@ -756,8 +755,6 @@ void mark_rodata_ro(void)
|
||||
pr_info("Write protecting kernel text and read-only data: %luk\n",
|
||||
size >> 10);
|
||||
|
||||
- execmem_cache_make_ro();
|
||||
-
|
||||
kernel_set_to_readonly = 1;
|
||||
|
||||
#ifdef CONFIG_CPA_DEBUG
|
||||
--- a/arch/x86/mm/init_64.c
|
||||
+++ b/arch/x86/mm/init_64.c
|
||||
@@ -34,7 +34,6 @@
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kcore.h>
|
||||
#include <linux/bootmem_info.h>
|
||||
-#include <linux/execmem.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/bios_ebda.h>
|
||||
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
|
||||
(end - start) >> 10);
|
||||
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
|
||||
|
||||
- execmem_cache_make_ro();
|
||||
-
|
||||
kernel_set_to_readonly = 1;
|
||||
|
||||
/*
|
||||
--- a/include/linux/execmem.h
|
||||
+++ b/include/linux/execmem.h
|
||||
@@ -54,7 +54,7 @@ enum execmem_range_flags {
|
||||
EXECMEM_ROX_CACHE = (1 << 1),
|
||||
};
|
||||
|
||||
-#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM)
|
||||
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
|
||||
/**
|
||||
* execmem_fill_trapping_insns - set memory to contain instructions that
|
||||
* will trap
|
||||
@@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size
|
||||
* Return: 0 on success or negative error code on failure.
|
||||
*/
|
||||
int execmem_restore_rox(void *ptr, size_t size);
|
||||
-
|
||||
-/*
|
||||
- * Called from mark_readonly(), where the system transitions to ROX.
|
||||
- */
|
||||
-void execmem_cache_make_ro(void);
|
||||
#else
|
||||
static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
|
||||
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
|
||||
-static inline void execmem_cache_make_ro(void) { }
|
||||
#endif
|
||||
|
||||
/**
|
||||
--- a/mm/execmem.c
|
||||
+++ b/mm/execmem.c
|
||||
@@ -254,34 +254,6 @@ out_unlock:
|
||||
return ptr;
|
||||
}
|
||||
|
||||
-static bool execmem_cache_rox = false;
|
||||
-
|
||||
-void execmem_cache_make_ro(void)
|
||||
-{
|
||||
- struct maple_tree *free_areas = &execmem_cache.free_areas;
|
||||
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
|
||||
- MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
|
||||
- MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
|
||||
- struct mutex *mutex = &execmem_cache.mutex;
|
||||
- void *area;
|
||||
-
|
||||
- execmem_cache_rox = true;
|
||||
-
|
||||
- mutex_lock(mutex);
|
||||
-
|
||||
- mas_for_each(&mas_free, area, ULONG_MAX) {
|
||||
- unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
|
||||
- set_memory_ro(mas_free.index, pages);
|
||||
- }
|
||||
-
|
||||
- mas_for_each(&mas_busy, area, ULONG_MAX) {
|
||||
- unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
|
||||
- set_memory_ro(mas_busy.index, pages);
|
||||
- }
|
||||
-
|
||||
- mutex_unlock(mutex);
|
||||
-}
|
||||
-
|
||||
static int execmem_cache_populate(struct execmem_range *range, size_t size)
|
||||
{
|
||||
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
|
||||
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct
|
||||
/* fill memory with instructions that will trap */
|
||||
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
|
||||
|
||||
- if (execmem_cache_rox) {
|
||||
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
|
||||
- if (err)
|
||||
- goto err_free_mem;
|
||||
- } else {
|
||||
- err = set_memory_x((unsigned long)p, vm->nr_pages);
|
||||
- if (err)
|
||||
- goto err_free_mem;
|
||||
- }
|
||||
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
|
||||
+ if (err)
|
||||
+ goto err_free_mem;
|
||||
|
||||
err = execmem_cache_add(p, alloc_size);
|
||||
if (err)
|
@@ -1,63 +0,0 @@
|
||||
From 85bfdd784bd61df94fd42daca141ed173f647e8c Mon Sep 17 00:00:00 2001
|
||||
From: Kai Huang <kai.huang@intel.com>
|
||||
Date: Sat, 7 Jun 2025 01:07:37 +1200
|
||||
Subject: x86/virt/tdx: Avoid indirect calls to TDX assembly functions
|
||||
|
||||
Two 'static inline' TDX helper functions (sc_retry() and
|
||||
sc_retry_prerr()) take function pointer arguments which refer to
|
||||
assembly functions. Normally, the compiler inlines the TDX helper,
|
||||
realizes that the function pointer targets are completely static --
|
||||
thus can be resolved at compile time -- and generates direct call
|
||||
instructions.
|
||||
|
||||
But, other times (like when CONFIG_CC_OPTIMIZE_FOR_SIZE=y), the
|
||||
compiler declines to inline the helpers and will instead generate
|
||||
indirect call instructions.
|
||||
|
||||
Indirect calls to assembly functions require special annotation (for
|
||||
various Control Flow Integrity mechanisms). But TDX assembly
|
||||
functions lack the special annotations and can only be called
|
||||
directly.
|
||||
|
||||
Annotate both the helpers as '__always_inline' to prod the compiler
|
||||
into maintaining the direct calls. There is no guarantee here, but
|
||||
Peter has volunteered to report the compiler bug if this assumption
|
||||
ever breaks[1].
|
||||
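The mechanism relied on: once the helper is guaranteed to be inlined, the function-pointer argument is a compile-time constant at every call site and the compiler emits a direct call. A tiny C sketch of that devirtualization (ordinary functions instead of the SEAMCALL assembly stubs; GCC/Clang attribute syntax assumed):

  #include <stdio.h>

  typedef long (*sc_func_t)(long fn);

  static long do_call_a(long fn) { return fn + 1; }

  /* Forcing inlining means 'func' is a known constant in every expansion,
   * so the generated code contains a direct call to do_call_a(), never an
   * indirect call through a register. */
  static inline __attribute__((always_inline)) long sc_retry_sketch(sc_func_t func, long fn)
  {
      long ret = func(fn);
      /* a retry loop would live here in the real helper */
      return ret;
  }

  int main(void)
  {
      printf("%ld\n", sc_retry_sketch(do_call_a, 41));
      return 0;
  }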
|
||||
Fixes: 1e66a7e27539 ("x86/virt/tdx: Handle SEAMCALL no entropy error in common code")
|
||||
Fixes: df01f5ae07dd ("x86/virt/tdx: Add SEAMCALL error printing for module initialization")
|
||||
Signed-off-by: Kai Huang <kai.huang@intel.com>
|
||||
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/lkml/20250605145914.GW39944@noisy.programming.kicks-ass.net/ [1]
|
||||
Link: https://lore.kernel.org/all/20250606130737.30713-1-kai.huang%40intel.com
|
||||
---
|
||||
arch/x86/include/asm/tdx.h | 2 +-
|
||||
arch/x86/virt/vmx/tdx/tdx.c | 5 +++--
|
||||
2 files changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tdx.h
|
||||
+++ b/arch/x86/include/asm/tdx.h
|
||||
@@ -100,7 +100,7 @@ void tdx_init(void);
|
||||
|
||||
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
|
||||
|
||||
-static inline u64 sc_retry(sc_func_t func, u64 fn,
|
||||
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
|
||||
struct tdx_module_args *args)
|
||||
{
|
||||
int retry = RDRAND_RETRY_LOOPS;
|
||||
--- a/arch/x86/virt/vmx/tdx/tdx.c
|
||||
+++ b/arch/x86/virt/vmx/tdx/tdx.c
|
||||
@@ -69,8 +69,9 @@ static inline void seamcall_err_ret(u64
|
||||
args->r9, args->r10, args->r11);
|
||||
}
|
||||
|
||||
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
|
||||
- u64 fn, struct tdx_module_args *args)
|
||||
+static __always_inline int sc_retry_prerr(sc_func_t func,
|
||||
+ sc_err_func_t err_func,
|
||||
+ u64 fn, struct tdx_module_args *args)
|
||||
{
|
||||
u64 sret = sc_retry(func, fn, args);
|
||||
|
@@ -1,31 +0,0 @@
|
||||
From a94cf5c6e7e31be9d4788916ce847adb15735d81 Mon Sep 17 00:00:00 2001
|
||||
From: Juergen Gross <jgross@suse.com>
|
||||
Date: Tue, 3 Jun 2025 14:14:41 +0300
|
||||
Subject: x86/mm/pat: don't collapse pages without PSE set
|
||||
|
||||
Collapsing pages to a leaf PMD or PUD should be done only if
|
||||
X86_FEATURE_PSE is available, which is not the case when running e.g.
|
||||
as a Xen PV guest.
|
||||
|
||||
Fixes: 41d88484c71c ("x86/mm/pat: restore large ROX pages after fragmentation")
|
||||
Signed-off-by: Juergen Gross <jgross@suse.com>
|
||||
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/r/20250528123557.12847-3-jgross@suse.com
|
||||
---
|
||||
arch/x86/mm/pat/set_memory.c | 3 +++
|
||||
1 file changed, 3 insertions(+)
|
||||
|
||||
--- a/arch/x86/mm/pat/set_memory.c
|
||||
+++ b/arch/x86/mm/pat/set_memory.c
|
||||
@@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd,
|
||||
pgprot_t pgprot;
|
||||
int i = 0;
|
||||
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_PSE))
|
||||
+ return 0;
|
||||
+
|
||||
addr &= PMD_MASK;
|
||||
pte = pte_offset_kernel(pmd, addr);
|
||||
first = *pte;
|
@@ -1,34 +0,0 @@
|
||||
From 8f28d595d167316469bb33b701e27b4b79c1aab1 Mon Sep 17 00:00:00 2001
|
||||
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
|
||||
Date: Tue, 3 Jun 2025 14:14:42 +0300
|
||||
Subject: x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX
|
||||
is set
|
||||
|
||||
Currently ROX cache in execmem is enabled regardless of
|
||||
STRICT_MODULE_RWX setting. This breaks an assumption that module memory
|
||||
is writable when STRICT_MODULE_RWX is disabled, for instance for kernel
|
||||
debugging.
|
||||
|
||||
Only enable ROX cache in execmem when STRICT_MODULE_RWX is set to
|
||||
restore the original behaviour of module text permissions.
|
||||
|
||||
Fixes: 64f6a4e10c05 ("x86: re-enable EXECMEM_ROX support")
|
||||
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lkml.kernel.org/r/20250603111446.2609381-3-rppt@kernel.org
|
||||
---
|
||||
arch/x86/Kconfig | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/Kconfig
|
||||
+++ b/arch/x86/Kconfig
|
||||
@@ -88,7 +88,7 @@ config X86
|
||||
select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
|
||||
select ARCH_HAS_EARLY_DEBUG if KGDB
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
- select ARCH_HAS_EXECMEM_ROX if X86_64
|
||||
+ select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX
|
||||
select ARCH_HAS_FAST_MULTIPLIER
|
||||
select ARCH_HAS_FORTIFY_SOURCE
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
@@ -1,110 +0,0 @@
|
||||
From 24fd2e3cef1b98f4417b8015ba24a8a4dcaae0c1 Mon Sep 17 00:00:00 2001
|
||||
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
|
||||
Date: Tue, 3 Jun 2025 14:14:43 +0300
|
||||
Subject: x86/its: move its_pages array to struct mod_arch_specific
|
||||
|
||||
The pages with ITS thunks allocated for modules are tracked by an
|
||||
array in 'struct module'.
|
||||
|
||||
Since this is a very architecture-specific data structure, move it to
|
||||
'struct mod_arch_specific'.
|
||||
|
||||
No functional changes.
|
||||
|
||||
Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
|
||||
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org
|
||||
---
|
||||
arch/x86/include/asm/module.h | 8 ++++++++
|
||||
arch/x86/kernel/alternative.c | 19 ++++++++++---------
|
||||
include/linux/module.h | 5 -----
|
||||
3 files changed, 18 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/module.h
|
||||
+++ b/arch/x86/include/asm/module.h
|
||||
@@ -5,12 +5,20 @@
|
||||
#include <asm-generic/module.h>
|
||||
#include <asm/orc_types.h>
|
||||
|
||||
+struct its_array {
|
||||
+#ifdef CONFIG_MITIGATION_ITS
|
||||
+ void **pages;
|
||||
+ int num;
|
||||
+#endif
|
||||
+};
|
||||
+
|
||||
struct mod_arch_specific {
|
||||
#ifdef CONFIG_UNWINDER_ORC
|
||||
unsigned int num_orcs;
|
||||
int *orc_unwind_ip;
|
||||
struct orc_entry *orc_unwind;
|
||||
#endif
|
||||
+ struct its_array its_pages;
|
||||
};
|
||||
|
||||
#endif /* _ASM_X86_MODULE_H */
|
||||
--- a/arch/x86/kernel/alternative.c
|
||||
+++ b/arch/x86/kernel/alternative.c
|
||||
@@ -195,8 +195,8 @@ void its_fini_mod(struct module *mod)
|
||||
its_page = NULL;
|
||||
mutex_unlock(&text_mutex);
|
||||
|
||||
- for (int i = 0; i < mod->its_num_pages; i++) {
|
||||
- void *page = mod->its_page_array[i];
|
||||
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
|
||||
+ void *page = mod->arch.its_pages.pages[i];
|
||||
execmem_restore_rox(page, PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
@@ -206,11 +206,11 @@ void its_free_mod(struct module *mod)
|
||||
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
|
||||
return;
|
||||
|
||||
- for (int i = 0; i < mod->its_num_pages; i++) {
|
||||
- void *page = mod->its_page_array[i];
|
||||
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
execmem_free(page);
}
- kfree(mod->its_page_array);
+ kfree(mod->arch.its_pages.pages);
}
#endif /* CONFIG_MODULES */

@@ -223,14 +223,15 @@ static void *its_alloc(void)

#ifdef CONFIG_MODULES
if (its_mod) {
- void *tmp = krealloc(its_mod->its_page_array,
- (its_mod->its_num_pages+1) * sizeof(void *),
+ struct its_array *pages = &its_mod->arch.its_pages;
+ void *tmp = krealloc(pages->pages,
+ (pages->num+1) * sizeof(void *),
GFP_KERNEL);
if (!tmp)
return NULL;

- its_mod->its_page_array = tmp;
- its_mod->its_page_array[its_mod->its_num_pages++] = page;
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;

execmem_make_temp_rw(page, PAGE_SIZE);
}
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,11 +586,6 @@ struct module {
atomic_t refcnt;
#endif

-#ifdef CONFIG_MITIGATION_ITS
- int its_num_pages;
- void **its_page_array;
-#endif
-
#ifdef CONFIG_CONSTRUCTORS
/* Constructor functions. */
ctor_fn_t *ctors;
@@ -1,148 +0,0 @@
From 48d82c4dd03de376a6f673bda0f4f2b97138d855 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Tue, 3 Jun 2025 14:14:44 +0300
Subject: x86/its: explicitly manage permissions for ITS pages

execmem_alloc() sets permissions differently depending on the kernel
configuration, CPU support for PSE and whether a page is allocated
before or after mark_rodata_ro().

Add tracking for pages allocated for ITS when patching the core kernel
and make sure the permissions for ITS pages are explicitly managed for
both kernel and module allocations.

Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-5-rppt@kernel.org
---
arch/x86/kernel/alternative.c | 74 ++++++++++++++++++++++++-----------
1 file changed, 52 insertions(+), 22 deletions(-)

--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -138,6 +138,24 @@ static struct module *its_mod;
#endif
static void *its_page;
static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}

/* Initialize a thunk with the "jmp *reg; int3" instructions. */
static void *its_init_thunk(void *thunk, int reg)
@@ -173,6 +191,21 @@ static void *its_init_thunk(void *thunk,
return thunk + offset;
}

+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
#ifdef CONFIG_MODULES
void its_init_mod(struct module *mod)
{
@@ -195,10 +228,8 @@ void its_fini_mod(struct module *mod)
its_page = NULL;
mutex_unlock(&text_mutex);

- for (int i = 0; i < mod->arch.its_pages.num; i++) {
- void *page = mod->arch.its_pages.pages[i];
- execmem_restore_rox(page, PAGE_SIZE);
- }
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
}

void its_free_mod(struct module *mod)
@@ -216,28 +247,23 @@ void its_free_mod(struct module *mod)

static void *its_alloc(void)
{
- void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ struct its_array *pages = &its_pages;
+ void *page;

+#ifdef CONFIG_MODULE
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
+#endif
+
+ page = __its_alloc(pages);
if (!page)
return NULL;

-#ifdef CONFIG_MODULES
- if (its_mod) {
- struct its_array *pages = &its_mod->arch.its_pages;
- void *tmp = krealloc(pages->pages,
- (pages->num+1) * sizeof(void *),
- GFP_KERNEL);
- if (!tmp)
- return NULL;
-
- pages->pages = tmp;
- pages->pages[pages->num++] = page;
+ execmem_make_temp_rw(page, PAGE_SIZE);
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);

- execmem_make_temp_rw(page, PAGE_SIZE);
- }
-#endif /* CONFIG_MODULES */
-
- return no_free_ptr(page);
+ return page;
}

static void *its_allocate_thunk(int reg)
@@ -291,7 +317,9 @@ u8 *its_static_thunk(int reg)
return thunk;
}

-#endif
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */

/*
* Nomenclature for variable names to simplify and clarify this code and ease
@@ -2368,6 +2396,8 @@ void __init alternative_instructions(voi
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);

+ its_fini_core();
+
/*
* Adjust all CALL instructions to point to func()-10, including
* those in .altinstr_replacement.
@@ -1,32 +0,0 @@
From 9bed8caa4c73f2d524d9600c74e6cbcff71c2456 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 29 Apr 2025 08:32:15 -0700
Subject: KVM: SVM: Clear current_vmcb during vCPU free for all *possible* CPUs

When freeing a vCPU and thus its VMCB, clear current_vmcb for all possible
CPUs, not just online CPUs, as it's theoretically possible a CPU could go
offline and come back online in conjunction with KVM reusing the page for
a new VMCB.

Link: https://lore.kernel.org/all/20250320013759.3965869-1-yosry.ahmed@linux.dev
Fixes: fd65d3142f73 ("kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb")
Cc: stable@vger.kernel.org
Cc: Jim Mattson <jmattson@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
[sean: split to separate patch, write changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/svm/svm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1488,7 +1488,7 @@ static void svm_clear_current_vmcb(struc
{
int i;

- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}

@@ -1,43 +0,0 @@
From d74cb6c8b70d9b5ad8482f4821679b83bad9de63 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Mon, 24 Mar 2025 22:08:48 +0800
Subject: KVM: VMX: Flush shadow VMCS on emergency reboot

Ensure the shadow VMCS cache is evicted during an emergency reboot to
prevent potential memory corruption if the cache is evicted after reboot.

This issue was identified through code inspection, as __loaded_vmcs_clear()
flushes both the normal VMCS and the shadow VMCS.

Avoid checking the "launched" state during an emergency reboot, unlike the
behavior in __loaded_vmcs_clear(). This is important because reboot NMIs
can interfere with operations like copy_shadow_to_vmcs12(), where shadow
VMCSes are loaded directly using VMPTRLD. In such cases, if NMIs occur
right after the VMCS load, the shadow VMCSes will be active but the
"launched" state may not be set.

Fixes: 16f5b9034b69 ("KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12")
Cc: stable@vger.kernel.org
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250324140849.2099723-1-chao.gao@intel.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/vmx/vmx.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -769,8 +769,11 @@ void vmx_emergency_disable_virtualizatio
return;

list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }

kvm_cpu_vmxoff();
}
@@ -1,64 +0,0 @@
From 6e492900893c011cbe13fbb881cf1e11df08982b Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Wed, 18 Jun 2025 07:32:17 +0000
Subject: cgroup,freezer: fix incomplete freezing when attaching tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An issue was found:

# cd /sys/fs/cgroup/freezer/
# mkdir test
# echo FROZEN > test/freezer.state
# cat test/freezer.state
FROZEN
# sleep 1000 &
[1] 863
# echo 863 > test/cgroup.procs
# cat test/freezer.state
FREEZING

When tasks are migrated to a frozen cgroup, the freezer fails to
immediately freeze the tasks, causing the cgroup to remain in the
"FREEZING".

The freeze_task() function is called before clearing the CGROUP_FROZEN
flag. This causes the freezing() check to incorrectly return false,
preventing __freeze_task() from being invoked for the migrated task.

To fix this issue, clear the CGROUP_FROZEN state before calling
freeze_task().

Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic")
Cc: stable@vger.kernel.org # v6.1+
Reported-by: Zhong Jiawei <zhongjiawei1@huawei.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/cgroup/legacy_freezer.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 039d1eb2f215..507b8f19a262 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -188,13 +188,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
-
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}

--
2.50.0