release 6.15.2 (preliminary)

2025-06-18 12:24:58 +03:00
parent 4d2691343a
commit 43dc655d2e
242 changed files with 7729 additions and 32303 deletions


@@ -1,176 +0,0 @@
From 4506de20739ac4726a258faa98609a552184d2d2 Mon Sep 17 00:00:00 2001
From: Sergio González Collado <sergio.collado@gmail.com>
Date: Sun, 2 Mar 2025 23:15:18 +0100
Subject: Kunit to check the longest symbol length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The longest length of a symbol (KSYM_NAME_LEN) was increased to 512
in the reference [1]. This patch adds a KUnit test suite to check the
longest symbol length. These tests verify that the longest symbol length
defined is supported.

This test can also help other efforts toward longer symbol lengths,
like [2].

The test suite defines one symbol with the longest possible length.
The first test verifies that a function with the created symbol name
can be called. The second test verifies that the symbol is present in
the kernel symbol table.
[1] https://lore.kernel.org/lkml/20220802015052.10452-6-ojeda@kernel.org/
[2] https://lore.kernel.org/lkml/20240605032120.3179157-1-song@kernel.org/
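Independently of the KUnit suite, a quick sanity check of how long symbol
names get on a running kernel can be done from userspace by scanning
/proc/kallsyms. A minimal sketch (not part of this patch; it only assumes
the usual "address type name [module]" kallsyms line format):
```
/* Report the longest symbol name currently visible in /proc/kallsyms.
 * KSYM_NAME_LEN (512) includes the terminating NUL, so names of up to
 * 511 characters are expected to fit. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[1024], name[1024];
	size_t len, best = 0;
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* each line: "<address> <type> <name> [module]" */
		if (sscanf(line, "%*s %*s %1023s", name) != 1)
			continue;
		len = strlen(name);
		if (len > best)
			best = len;
	}
	fclose(f);
	printf("longest symbol name seen: %zu characters\n", best);
	return 0;
}
```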
Tested-by: Martin Rodriguez Reboredo <yakoyoku@gmail.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Rae Moar <rmoar@google.com>
Signed-off-by: Sergio González Collado <sergio.collado@gmail.com>
Link: https://github.com/Rust-for-Linux/linux/issues/504
Source: https://lore.kernel.org/rust-for-linux/20250302221518.76874-1-sergio.collado@gmail.com/
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/63
---
arch/x86/tools/insn_decoder_test.c | 3 +-
lib/Kconfig.debug | 9 ++++
lib/Makefile | 2 +
lib/longest_symbol_kunit.c | 82 ++++++++++++++++++++++++++++++
4 files changed, 95 insertions(+), 1 deletion(-)
create mode 100644 lib/longest_symbol_kunit.c
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -10,6 +10,7 @@
#include <assert.h>
#include <unistd.h>
#include <stdarg.h>
+#include <linux/kallsyms.h>
#define unlikely(cond) (cond)
@@ -106,7 +107,7 @@ static void parse_args(int argc, char **
}
}
-#define BUFSIZE 256
+#define BUFSIZE (256 + KSYM_NAME_LEN)
int main(int argc, char **argv)
{
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2838,6 +2838,15 @@ config FORTIFY_KUNIT_TEST
by the str*() and mem*() family of functions. For testing runtime
traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests.
+config LONGEST_SYM_KUNIT_TEST
+ tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
+ depends on KUNIT && KPROBES
+ default KUNIT_ALL_TESTS
+ help
+ Tests the longest symbol possible
+
+ If unsure, say N.
+
config HW_BREAKPOINT_KUNIT_TEST
bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS
depends on HAVE_HW_BREAKPOINT
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -398,6 +398,8 @@ obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fort
obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o
obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o
+obj-$(CONFIG_LONGEST_SYM_KUNIT_TEST) += longest_symbol_kunit.o
+CFLAGS_longest_symbol_kunit.o += $(call cc-disable-warning, missing-prototypes)
obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
--- /dev/null
+++ b/lib/longest_symbol_kunit.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test the longest symbol length. Execute with:
+ * ./tools/testing/kunit/kunit.py run longest-symbol
+ * --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y
+ * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n
+ * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <kunit/test.h>
+#include <linux/stringify.h>
+#include <linux/kprobes.h>
+#include <linux/kallsyms.h>
+
+#define DI(name) s##name##name
+#define DDI(name) DI(n##name##name)
+#define DDDI(name) DDI(n##name##name)
+#define DDDDI(name) DDDI(n##name##name)
+#define DDDDDI(name) DDDDI(n##name##name)
+
+/*Generate a symbol whose name length is 511 */
+#define LONGEST_SYM_NAME DDDDDI(g1h2i3j4k5l6m7n)
+
+#define RETURN_LONGEST_SYM 0xAAAAA
+
+noinline int LONGEST_SYM_NAME(void);
+noinline int LONGEST_SYM_NAME(void)
+{
+ return RETURN_LONGEST_SYM;
+}
+
+_Static_assert(sizeof(__stringify(LONGEST_SYM_NAME)) == KSYM_NAME_LEN,
+"Incorrect symbol length found. Expected KSYM_NAME_LEN: "
+__stringify(KSYM_NAME_LEN) ", but found: "
+__stringify(sizeof(LONGEST_SYM_NAME)));
+
+static void test_longest_symbol(struct kunit *test)
+{
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, LONGEST_SYM_NAME());
+};
+
+static void test_longest_symbol_kallsyms(struct kunit *test)
+{
+ unsigned long (*kallsyms_lookup_name)(const char *name);
+ static int (*longest_sym)(void);
+
+ struct kprobe kp = {
+ .symbol_name = "kallsyms_lookup_name",
+ };
+
+ if (register_kprobe(&kp) < 0) {
+ pr_info("%s: kprobe not registered", __func__);
+ KUNIT_FAIL(test, "test_longest_symbol kallsyms: kprobe not registered\n");
+ return;
+ }
+
+ kunit_warn(test, "test_longest_symbol kallsyms: kprobe registered\n");
+ kallsyms_lookup_name = (unsigned long (*)(const char *name))kp.addr;
+ unregister_kprobe(&kp);
+
+ longest_sym =
+ (void *) kallsyms_lookup_name(__stringify(LONGEST_SYM_NAME));
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, longest_sym());
+};
+
+static struct kunit_case longest_symbol_test_cases[] = {
+ KUNIT_CASE(test_longest_symbol),
+ KUNIT_CASE(test_longest_symbol_kallsyms),
+ {}
+};
+
+static struct kunit_suite longest_symbol_test_suite = {
+ .name = "longest-symbol",
+ .test_cases = longest_symbol_test_cases,
+};
+kunit_test_suite(longest_symbol_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test the longest symbol length");
+MODULE_AUTHOR("Sergio González Collado");


@@ -0,0 +1,70 @@
From cda8b1022f32bb7a917148f75f4641e7a5b3e729 Mon Sep 17 00:00:00 2001
From: Jinliang Zheng <alexjlzheng@tencent.com>
Date: Tue, 15 Apr 2025 17:02:32 +0800
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
calling writeback_set_ratelimit(), as global_dirty_limits() always
prioritizes the value of vm_dirty_bytes.
It's domain_dirty_limits() that's relevant here, not node_dirty_ok:
dirty_ratio_handler
  writeback_set_ratelimit
    global_dirty_limits(&dirty_thresh)  <- ratelimit_pages based on dirty_thresh
      domain_dirty_limits
        if (bytes)                      <- bytes = vm_dirty_bytes <-----------+
          thresh = f1(bytes)            <- prioritizes vm_dirty_bytes         |
        else                                                                  |
          thresh = f2(ratio)                                                  |
    ratelimit_pages = f3(dirty_thresh)                                        |
  vm_dirty_bytes = 0                    <- it's late! -------------------------+
This causes ratelimit_pages to still use the value calculated based on
vm_dirty_bytes, which is wrong now.
The impact visible to userspace is difficult to capture directly because
there is no procfs/sysfs interface exported to user space. However, it
will have a real impact on the balance of dirty pages.
For example:
1. By default, we have vm_dirty_ratio=40, vm_dirty_bytes=0.
2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192, vm_dirty_ratio=0,
   and ratelimit_pages is now calculated based on vm_dirty_bytes.
3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
   zero when writeback_set_ratelimit() -> global_dirty_limits() ->
   domain_dirty_limits() is called, ratelimit_pages is still calculated
   based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
   conform to the actual intent of the user.
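For illustration only, the small standalone program below (not kernel code;
the helper name and numbers are made up) mirrors the priority used by
domain_dirty_limits() - a non-zero bytes value always wins over the ratio -
and shows why the ordering matters: if vm_dirty_bytes is cleared only after
the recomputation, the threshold that ratelimit_pages is derived from
(f3(dirty_thresh) in the diagram above) still reflects the stale bytes value.
```
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Mirrors the priority logic of domain_dirty_limits(): bytes, if set, wins. */
static unsigned long dirty_thresh(unsigned long bytes, unsigned long ratio,
				  unsigned long avail_pages)
{
	if (bytes)
		return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
	return avail_pages * ratio / 100;
}

int main(void)
{
	unsigned long avail = 1000000;		/* available pages */
	unsigned long vm_dirty_bytes = 8192;	/* step 2: dirty_bytes set */
	unsigned long vm_dirty_ratio = 20;	/* step 3: dirty_ratio written */

	/* Buggy order: recompute while vm_dirty_bytes is still non-zero. */
	printf("buggy thresh: %lu pages\n",
	       dirty_thresh(vm_dirty_bytes, vm_dirty_ratio, avail));

	/* Fixed order: clear vm_dirty_bytes first, then recompute. */
	vm_dirty_bytes = 0;
	printf("fixed thresh: %lu pages\n",
	       dirty_thresh(vm_dirty_bytes, vm_dirty_ratio, avail));
	return 0;
}
```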
Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com>
Reviewed-by: MengEn Sun <mengensun@tencent.com>
Cc: Andrea Righi <andrea@betterlinux.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jinliang Zheng <alexjlzheng@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}


@@ -0,0 +1,179 @@
From 30a724581b5037176f6492359c189ebb180ccf1f Mon Sep 17 00:00:00 2001
From: GONG Ruiqi <gongruiqi1@huawei.com>
Date: Sun, 27 Apr 2025 10:53:03 +0800
Subject: vgacon: Add check for vc_origin address range in vgacon_scroll()
Our in-house Syzkaller reported the following BUG (twice), which we
believed was the same issue with [1]:
==================================================================
BUG: KASAN: slab-out-of-bounds in vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
Read of size 2 at addr ffff88800f5bef60 by task syz.7.2620/12393
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x72/0xa0 lib/dump_stack.c:106
print_address_description.constprop.0+0x6b/0x3d0 mm/kasan/report.c:364
print_report+0xba/0x280 mm/kasan/report.c:475
kasan_report+0xa9/0xe0 mm/kasan/report.c:588
vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
vcs_write_buf_noattr drivers/tty/vt/vc_screen.c:493 [inline]
vcs_write+0x586/0x840 drivers/tty/vt/vc_screen.c:690
vfs_write+0x219/0x960 fs/read_write.c:584
ksys_write+0x12e/0x260 fs/read_write.c:639
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
...
</TASK>
Allocated by task 5614:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
kasan_set_track+0x25/0x30 mm/kasan/common.c:52
____kasan_kmalloc mm/kasan/common.c:374 [inline]
__kasan_kmalloc+0x8f/0xa0 mm/kasan/common.c:383
kasan_kmalloc include/linux/kasan.h:201 [inline]
__do_kmalloc_node mm/slab_common.c:1007 [inline]
__kmalloc+0x62/0x140 mm/slab_common.c:1020
kmalloc include/linux/slab.h:604 [inline]
kzalloc include/linux/slab.h:721 [inline]
vc_do_resize+0x235/0xf40 drivers/tty/vt/vt.c:1193
vgacon_adjust_height+0x2d4/0x350 drivers/video/console/vgacon.c:1007
vgacon_font_set+0x1f7/0x240 drivers/video/console/vgacon.c:1031
con_font_set drivers/tty/vt/vt.c:4628 [inline]
con_font_op+0x4da/0xa20 drivers/tty/vt/vt.c:4675
vt_k_ioctl+0xa10/0xb30 drivers/tty/vt/vt_ioctl.c:474
vt_ioctl+0x14c/0x1870 drivers/tty/vt/vt_ioctl.c:752
tty_ioctl+0x655/0x1510 drivers/tty/tty_io.c:2779
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:871 [inline]
__se_sys_ioctl+0x12d/0x190 fs/ioctl.c:857
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
__fput_sync+0x4c/0x60 fs/file_table.c:465
__do_sys_close fs/open.c:1580 [inline]
__se_sys_close+0x68/0xd0 fs/open.c:1565
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Second to last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
task_work_run+0x154/0x240 kernel/task_work.c:239
exit_task_work include/linux/task_work.h:45 [inline]
do_exit+0x8e5/0x1320 kernel/exit.c:874
do_group_exit+0xcd/0x280 kernel/exit.c:1023
get_signal+0x1675/0x1850 kernel/signal.c:2905
arch_do_signal_or_restart+0x80/0x3b0 arch/x86/kernel/signal.c:310
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
syscall_exit_to_user_mode+0x1b3/0x1e0 kernel/entry/common.c:218
do_syscall_64+0x66/0x110 arch/x86/entry/common.c:87
entry_SYSCALL_64_after_hwframe+0x78/0xe2
The buggy address belongs to the object at ffff88800f5be000
which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 2656 bytes to the right of
allocated 1280-byte region [ffff88800f5be000, ffff88800f5be500)
...
Memory state around the buggy address:
ffff88800f5bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88800f5bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
^
ffff88800f5bef80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bf000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
By analyzing the vmcore, we found that vc->vc_origin was somehow placed
one line prior to vc->vc_screenbuf when vc was in KD_TEXT mode, and
further writes to /dev/vcs caused out-of-bounds reads (and writes
right after) in vcs_write_buf_noattr().

Our further experiments show that in most cases vc->vc_origin equals
vga_vram_base when the console is in KD_TEXT mode, and it's around
vc->vc_screenbuf for KD_GRAPHICS mode. But by triggering a
TIOCL_SETVESABLANK ioctl beforehand, we can make vc->vc_origin be around
vc->vc_screenbuf while the console is in KD_TEXT mode, and then, by
writing the special 'ESC M' control sequence to the tty a certain number
of times (depending on the value of `vc->state.y - vc->vc_top`), we can
eventually move vc->vc_origin prior to vc->vc_screenbuf. Here's the PoC,
tested on QEMU:
```
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vt.h>
#include <linux/tiocl.h>

int main() {
	const int RI_NUM = 10; // should be greater than `vc->state.y - vc->vc_top`
	int tty_fd, vcs_fd;
	const char *tty_path = "/dev/tty0";
	const char *vcs_path = "/dev/vcs";
	const char escape_seq[] = "\x1bM"; // ESC + M
	const char trigger_seq[] = "Let's trigger an OOB write.";
	struct vt_sizes vt_size = { 70, 2 };
	int blank = TIOCL_BLANKSCREEN;

	tty_fd = open(tty_path, O_RDWR);

	char vesa_mode[] = { TIOCL_SETVESABLANK, 1 };
	ioctl(tty_fd, TIOCLINUX, vesa_mode);

	ioctl(tty_fd, TIOCLINUX, &blank);
	ioctl(tty_fd, VT_RESIZE, &vt_size);

	for (int i = 0; i < RI_NUM; ++i)
		write(tty_fd, escape_seq, sizeof(escape_seq) - 1);

	vcs_fd = open(vcs_path, O_RDWR);
	write(vcs_fd, trigger_seq, sizeof(trigger_seq));

	close(vcs_fd);
	close(tty_fd);
	return 0;
}
```
To solve this problem, add an address range validation check in
vgacon_scroll(), ensuring vc->vc_origin never precedes vc_screenbuf.
Reported-by: syzbot+9c09fda97a1a65ea859b@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=9c09fda97a1a65ea859b [1]
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Co-developed-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/console/vgacon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1168,7 +1168,7 @@ static bool vgacon_scroll(struct vc_data
c->vc_screenbuf_size - delta);
c->vc_origin = vga_vram_end - c->vc_screenbuf_size;
vga_rolled_over = 0;
- } else
+ } else if (oldo - delta >= (unsigned long)c->vc_screenbuf)
c->vc_origin -= delta;
c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size;
scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char,


@@ -1,36 +0,0 @@
From b5a4b82efd19d0687a5582a58f6830bf714e34fc Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 18 Mar 2025 15:32:30 -0700
Subject: x86/tools: Drop duplicate unlikely() definition in
insn_decoder_test.c
After commit c104c16073b7 ("Kunit to check the longest symbol length"),
there is a warning when building with clang because there is now a
definition of unlikely from compiler.h in tools/include/linux, which
conflicts with the one in the instruction decoder selftest:
arch/x86/tools/insn_decoder_test.c:15:9: warning: 'unlikely' macro redefined [-Wmacro-redefined]
Remove the second unlikely() definition, as it is no longer necessary,
clearing up the warning.
Fixes: c104c16073b7 ("Kunit to check the longest symbol length")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20250318-x86-decoder-test-fix-unlikely-redef-v1-1-74c84a7bf05b@kernel.org
---
arch/x86/tools/insn_decoder_test.c | 2 --
1 file changed, 2 deletions(-)
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -12,8 +12,6 @@
#include <stdarg.h>
#include <linux/kallsyms.h>
-#define unlikely(cond) (cond)
-
#include <asm/insn.h>
#include <inat.c>
#include <insn.c>


@@ -0,0 +1,102 @@
From 5cf26cf9fd9c11cb1543aac026f8928829895663 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:06 +0300
Subject: fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in do_register_framebuffer() fails to allocate
memory for the fb_videomode, it will later lead to a null-ptr dereference
in fb_videomode_to_var(), as the fb_info is registered without having the
expected mode in its modelist, i.e. the one described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
Even though fbcon_init() checks beforehand whether fb_match_mode() in
var_to_display() fails, it cannot prevent the panic because fbcon_init()
does not return an error code. Considering this, and the comment in the
code about fb_match_mode() returning NULL - "This should not happen" - it
is better to prevent registering the fb_info if its mode was not set
successfully. Also move fb_add_videomode() closer to the beginning of
do_register_framebuffer() to avoid having to do the cleanup on failure.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -388,7 +388,7 @@ static int fb_check_foreignness(struct f
static int do_register_framebuffer(struct fb_info *fb_info)
{
- int i;
+ int i, err = 0;
struct fb_videomode mode;
if (fb_check_foreignness(fb_info))
@@ -397,10 +397,18 @@ static int do_register_framebuffer(struc
if (num_registered_fb == FB_MAX)
return -ENXIO;
- num_registered_fb++;
for (i = 0 ; i < FB_MAX; i++)
if (!registered_fb[i])
break;
+
+ if (!fb_info->modelist.prev || !fb_info->modelist.next)
+ INIT_LIST_HEAD(&fb_info->modelist);
+
+ fb_var_to_videomode(&mode, &fb_info->var);
+ err = fb_add_videomode(&mode, &fb_info->modelist);
+ if (err < 0)
+ return err;
+
fb_info->node = i;
refcount_set(&fb_info->count, 1);
mutex_init(&fb_info->lock);
@@ -426,16 +434,12 @@ static int do_register_framebuffer(struc
if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT))
bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT);
- if (!fb_info->modelist.prev || !fb_info->modelist.next)
- INIT_LIST_HEAD(&fb_info->modelist);
-
if (fb_info->skip_vt_switch)
pm_vt_switch_required(fb_info->device, false);
else
pm_vt_switch_required(fb_info->device, true);
- fb_var_to_videomode(&mode, &fb_info->var);
- fb_add_videomode(&mode, &fb_info->modelist);
+ num_registered_fb++;
registered_fb[i] = fb_info;
#ifdef CONFIG_GUMSTIX_AM200EPD


@@ -1,34 +0,0 @@
From e3d18eed972374cfbac1e58cf109209b07c1e27e Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Tue, 8 Apr 2025 12:02:36 +0200
Subject: ice: mark ice_write_prof_mask_reg() as noinline
The following happens during build:
```
drivers/net/ethernet/intel/ice/ice.o: error: objtool: ice_free_prof_mask.isra.0() falls through to next function ice_free_flow_profs.cold()
drivers/net/ethernet/intel/ice/ice.o: error: objtool: ice_free_prof_mask.isra.0.cold() is missing an ELF size annotation
```
Marking ice_write_prof_mask_reg() as noinline solves this, although I'm
not sure whether this is a proper solution. Apparently this happens only
with -O3: the `default` case is never reachable, but the optimiser
generates a branch to a random code location.
Link: https://lore.kernel.org/lkml/6nzfoyak4cewjpmdflg5yi7jh2mqqdsfqgljoolx5lvdo2p65p@rwjfl7cqkfoo/
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
drivers/net/ethernet/intel/ice/ice_flex_pipe.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
+++ b/drivers/net/ethernet/intel/ice/ice_flex_pipe.c
@@ -1404,7 +1404,7 @@ static int ice_prof_inc_ref(struct ice_h
* @idx: index of the FV which will use the mask
* @mask: the 16-bit mask
*/
-static void
+static noinline void
ice_write_prof_mask_reg(struct ice_hw *hw, enum ice_block blk, u16 mask_idx,
u16 idx, u16 mask)
{


@@ -0,0 +1,65 @@
From 54c7f478f1a9d58f5609a48d461c7d495bb8301a Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:07 +0300
Subject: fbdev: Fix fb_set_var to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in fb_set_var() fails to allocate memory for the
fb_videomode, it may later lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered without having the
expected mode in its modelist, i.e. the one described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
The reason is that fb_info->var is being modified in fb_set_var(), and
then fb_videomode_to_var() is called. If it fails to add the mode to
fb_info->modelist, fb_set_var() returns an error, but does not restore
the old value of fb_info->var. Restore fb_info->var on failure the same
way it is done earlier in the function.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -328,8 +328,10 @@ fb_set_var(struct fb_info *info, struct
!list_empty(&info->modelist))
ret = fb_add_videomode(&mode, &info->modelist);
- if (ret)
+ if (ret) {
+ info->var = old_var;
return ret;
+ }
event.info = info;
event.data = &mode;


@@ -1,40 +0,0 @@
From e56acee381a8e07edf1920fb58f3166f911b6e5c Mon Sep 17 00:00:00 2001
From: Lingbo Kong <quic_lingbok@quicinc.com>
Date: Wed, 26 Feb 2025 19:31:18 +0800
Subject: wifi: ath12k: Abort scan before removing link interface to prevent
duplicate deletion
Currently, when ath12k performs the remove link interface operation, if
there is an ongoing scan operation on the arvif, ath12k may execute the
remove link interface operation multiple times on the same arvif. This
occurs because, during the remove link operation, if a scan operation is
present on the arvif, ath12k may receive a WMI_SCAN_EVENT_COMPLETED event
from the firmware. Upon receiving this event, ath12k will continue to
execute the ath12k_scan_vdev_clean_work() function, performing the remove
link interface operation on the same arvif again.
To address this issue, before executing the remove link interface
operation, ath12k needs to check if there is an ongoing scan operation on
the current arvif. If such an operation exists, it should be aborted.
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
Signed-off-by: Lingbo Kong <quic_lingbok@quicinc.com>
---
drivers/net/wireless/ath/ath12k/mac.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -9395,6 +9395,11 @@ ath12k_mac_op_unassign_vif_chanctx(struc
ar->num_started_vdevs == 1 && ar->monitor_vdev_created)
ath12k_mac_monitor_stop(ar);
+ if (ar->scan.arvif == arvif && ar->scan.state == ATH12K_SCAN_RUNNING) {
+ ath12k_scan_abort(ar);
+ ar->scan.arvif = NULL;
+ }
+
ath12k_mac_remove_link_interface(hw, arvif);
ath12k_mac_unassign_link_vif(arvif);
}


@@ -1,49 +0,0 @@
From 8d0e02f81d08c7b1e082028af0f55a22e7e1dfb2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 15 Apr 2025 10:22:04 +0200
Subject: Kconfig: switch CONFIG_SYSFS_SYCALL default to n
This odd system call will be removed in the future. Let's decouple it
from CONFIG_EXPERT and switch the default to n as a first step.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
init/Kconfig | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1603,6 +1603,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
the unaligned access emulation.
see arch/parisc/kernel/unaligned.c for reference
+config SYSFS_SYSCALL
+ bool "Sysfs syscall support"
+ default n
+ help
+ sys_sysfs is an obsolete system call no longer supported in libc.
+ Note that disabling this option is more secure but might break
+ compatibility with some systems.
+
+ If unsure say N here.
+
config HAVE_PCSPKR_PLATFORM
bool
@@ -1647,16 +1657,6 @@ config SGETMASK_SYSCALL
If unsure, leave the default option here.
-config SYSFS_SYSCALL
- bool "Sysfs syscall support" if EXPERT
- default y
- help
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
config FHANDLE
bool "open by fhandle syscalls" if EXPERT
select EXPORTFS


@@ -0,0 +1,113 @@
From 9cb2f9d210f915aabe54c5061d84f3fbe93c71ea Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:15 +0200
Subject: anon_inode: use a proper mode internally
This allows the VFS to not trip over anonymous inodes and we can add
asserts based on the mode into the vfs. When we report it to userspace
we can simply hide the mode to avoid regressions. I've audited all
direct callers of alloc_anon_inode() and only secretmen overrides i_mode
and i_op inode operations but it already uses a regular file.
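From userspace, the intended effect can be checked with fstat() on any
anonymous inode fd; a minimal sketch (eventfd is used here as an arbitrary
example, it is not taken from the patch):
```
/* After this change, st_mode of an anonymous inode fd should still report
 * no file-type bits (S_IFMT is masked off in getattr), preserving lsof's
 * expectation, even though the inode is internally a regular file. */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/eventfd.h>

int main(void)
{
	struct stat st;
	int fd = eventfd(0, 0);

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	printf("file type bits: 0%o (0 expected)\n", st.st_mode & S_IFMT);
	return 0;
}
```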
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-1-53a44c20d44e@kernel.org
Fixes: af153bb63a336 ("vfs: catch invalid modes in may_open()")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Reported-by: syzbot+5d8e79d323a13aa0b248@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67ed3fb3.050a0220.14623d.0009.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 36 ++++++++++++++++++++++++++++++++++++
fs/internal.h | 3 +++
fs/libfs.c | 8 +++++++-
3 files changed, 46 insertions(+), 1 deletion(-)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,44 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -66,6 +100,7 @@ static struct inode *anon_inode_make_sec
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@@ -313,6 +348,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -343,3 +343,6 @@ static inline bool path_mounted(const st
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1647,7 +1647,13 @@ struct inode *alloc_anon_inode(struct su
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ /*
+ * Historically anonymous inodes didn't have a type at all and
+ * userspace has come to rely on this. Internally they're just
+ * regular files but S_IFREG is masked off when reporting
+ * information to userspace.
+ */
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;


@@ -0,0 +1,80 @@
From ea4199112ae6d8da866417f50e035be01488c502 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:17 +0200
Subject: anon_inode: explicitly block ->setattr()
It is currently possible to change the mode and owner of the single
anonymous inode in the kernel:
#include <signal.h>
#include <unistd.h>
#include <sys/signalfd.h>
#include <sys/stat.h>

int main(int argc, char *argv[])
{
	int ret, sfd;
	sigset_t mask;
	struct signalfd_siginfo fdsi;

	sigemptyset(&mask);
	sigaddset(&mask, SIGINT);
	sigaddset(&mask, SIGQUIT);
	ret = sigprocmask(SIG_BLOCK, &mask, NULL);
	if (ret < 0)
		_exit(1);

	sfd = signalfd(-1, &mask, 0);
	if (sfd < 0)
		_exit(2);

	ret = fchown(sfd, 5555, 5555);
	if (ret < 0)
		_exit(3);

	ret = fchmod(sfd, 0777);
	if (ret < 0)
		_exit(3);

	_exit(4);
}
This is a bug. It's not really a meaningful one because anonymous inodes
don't really figure into path lookup and they cannot be reopened via
/proc/<pid>/fd/<nr> and can't be used for lookup itself. So they can
only ever serve as direct references.
But it is still completely bogus to allow the mode and ownership or any
of the properties of the anonymous inode to be changed. Block this!
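With this change applied, the fchown()/fchmod() calls above are expected to
fail with EOPNOTSUPP rather than succeed. A minimal hedged check (eventfd
used as a stand-in for any anonymous inode fd; not part of the patch):
```
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/eventfd.h>

int main(void)
{
	int fd = eventfd(0, 0);

	if (fd < 0)
		return 1;
	/* Both should now be rejected by anon_inode_setattr(). */
	if (fchmod(fd, 0777) < 0 && errno == EOPNOTSUPP &&
	    fchown(fd, 5555, 5555) < 0 && errno == EOPNOTSUPP)
		printf("setattr correctly blocked with EOPNOTSUPP\n");
	else
		printf("unexpected result, errno=%d\n", errno);
	return 0;
}
```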
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-3-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 7 +++++++
fs/internal.h | 2 ++
2 files changed, 9 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -57,8 +57,15 @@ int anon_inode_getattr(struct mnt_idmap
return 0;
}
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
static const struct inode_operations anon_inode_operations = {
.getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
};
/*
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -346,3 +346,5 @@ int statmount_mnt_idmap(struct mnt_idmap
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);


@@ -1,35 +0,0 @@
From f762c206076d274ecb0e2f3d9b6cbca361ebb246 Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Thu, 1 May 2025 20:22:53 +0200
Subject: wifi: mac80211: mark copy_mesh_setup() as noinline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
With -O3 and GCC v15.1, the following happens:
```
In function fortify_memcpy_chk,
inlined from copy_mesh_setup at net/mac80211/cfg.c:2541:2,
inlined from ieee80211_join_mesh at net/mac80211/cfg.c:2694:8:
./include/linux/fortify-string.h:571:25: warning: call to __write_overflow_field declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning]
```
Maybe, it's time to abandon -O3 altogether?
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
net/mac80211/cfg.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2501,7 +2501,7 @@ static inline bool _chg_mesh_attr(enum n
return (mask >> (parm-1)) & 0x1;
}
-static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
+static noinline int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
const struct mesh_setup *setup)
{
u8 *new_ie;


@@ -0,0 +1,39 @@
From 79f54c5bc7c6097a379c83e9ed56bee27cf1218a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:19 +0200
Subject: anon_inode: raise SB_I_NODEV and SB_I_NOEXEC
It isn't possible to execute anonymous inodes because they cannot be
opened in any way after they have been created. This includes execution:
execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH)
Anonymous inodes have inode->f_op set to no_open_fops, which sets
no_open(), which returns ENXIO. That means any call to do_dentry_open(),
which is the endpoint of do_open_execat(), will fail. There's no chance
to execute an anonymous inode, unless a given subsystem overrides it, of
course.
However, we should still harden this and raise SB_I_NODEV and
SB_I_NOEXEC on the superblock itself so that no one gets any creative
ideas.
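For illustration only (not from the patch), this is the kind of attempt being
ruled out. The raw syscall is used to avoid depending on a recent glibc
wrapper, and the exact errno may differ (ENXIO from no_open(), or an access
error once SB_I_NOEXEC applies):
```
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/eventfd.h>

int main(void)
{
	int fd = eventfd(0, 0);
	char *argv[] = { "anon", NULL };
	char *envp[] = { NULL };

	/* Try to execute an anonymous inode fd; this is expected to fail. */
	syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	printf("execveat on anon inode fd failed, errno=%d\n", errno);
	return 0;
}
```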
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-5-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 2 ++
1 file changed, 2 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -86,6 +86,8 @@ static int anon_inodefs_init_fs_context(
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}


@@ -0,0 +1,136 @@
From edaacbee0f33b7371ec460723d1042a6c5a4bb9d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 21 Apr 2025 10:27:40 +0200
Subject: fs: add S_ANON_INODE
This makes it easy to detect proper anonymous inodes and to ensure that
we can detect them in codepaths such as readahead().
Readahead on anonymous inodes didn't work because they didn't have a
proper mode. Now that they have we need to retain EINVAL being returned
otherwise LTP will fail.
We also need to ensure that ioctls aren't simply fired like they are for
regular files so things like inotify inodes continue to correctly call
their own ioctl handlers as in [1].
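A userspace way to check the retained behaviour (illustrative sketch only;
epoll is chosen arbitrarily as an anonymous inode fd):
```
/* readahead(2) on an anonymous inode fd is expected to keep failing with
 * EINVAL after this change, matching what LTP asserts. */
#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/epoll.h>

int main(void)
{
	int fd = epoll_create1(0);
	int ret = readahead(fd, 0, 4096);

	printf("readahead: %d, errno=%d (EINVAL=%d expected)\n", ret, errno, EINVAL);
	return 0;
}
```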
Reported-by: Xilin Wu <sophon@radxa.com>
Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1]
Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/ioctl.c | 7 ++++---
fs/libfs.c | 2 +-
fs/pidfs.c | 2 +-
include/linux/fs.h | 2 ++
mm/readahead.c | 20 ++++++++++++++++----
5 files changed, 24 insertions(+), 9 deletions(-)
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_get_fs_sysfs_path(filp, argp);
default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1656,7 +1656,7 @@ struct inode *alloc_anon_inode(struct su
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -826,7 +826,7 @@ static int pidfs_init_inode(struct inode
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2344,6 +2344,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struc
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t of
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);


@@ -0,0 +1,35 @@
From ab287d709809b6dfe4d3c42016a543d976533d51 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Wed, 7 May 2025 19:50:26 +0800
Subject: configfs: Do not override creating attribute file failure in
populate_attrs()
populate_attrs() may override a failure to create attribute files with
the success of creating subsequent bin attribute files, and thus return
the wrong value.

Fix this by only creating bin attribute files after the attribute files
have been created successfully.
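A standalone illustration of the return-value pattern being fixed (not the
configfs code itself; the helper names below are made up): a later success
clobbers an earlier failure if the second loop is not gated on the first one
succeeding.
```
#include <stdio.h>

static int create_attrs(void)     { return -1; }  /* pretend this fails    */
static int create_bin_attrs(void) { return  0; }  /* and this succeeds     */

static int populate_buggy(void)
{
	int error = create_attrs();

	error = create_bin_attrs();         /* oops: overwrites the earlier error */
	return error;
}

static int populate_fixed(void)
{
	int error = create_attrs();

	if (!error)
		error = create_bin_attrs(); /* only run when nothing failed */
	return error;
}

int main(void)
{
	printf("buggy: %d, fixed: %d\n", populate_buggy(), populate_fixed());
	return 0;
}
```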
Fixes: 03607ace807b ("configfs: implement binary attributes")
Cc: stable@vger.kernel.org
Reviewed-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20250507-fix_configfs-v3-2-fe2d96de8dc4@quicinc.com
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
fs/configfs/dir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -619,7 +619,7 @@ static int populate_attrs(struct config_
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
continue;


@@ -0,0 +1,104 @@
From 896b7b0d6ed53a7fe159c4b76f25407c816aa619 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 23 May 2025 19:20:36 -0400
Subject: Don't propagate mounts into detached trees
All versions up to 6.14 did not propagate mount events into detached
trees. Shortly after 6.14, a merge of vfs-6.15-rc1.mount.namespace
(130e696aa68b) changed that.
Unfortunately, that has caused userland regressions (reported in
https://lore.kernel.org/all/CAOYeF9WQhFDe+BGW=Dp5fK8oRy5AgZ6zokVyTj1Wp4EUiYgt4w@mail.gmail.com/)
Straight revert wouldn't be an option - in particular, the variant in 6.14
had a bug that got fixed in d1ddc6f1d9f0 ("fix IS_MNT_PROPAGATING uses")
and we don't want to bring the bug back.
This is a modification of manual revert posted by Christian, with changes
needed to avoid reintroducing the breakage in scenario described in
d1ddc6f1d9f0.
Cc: stable@vger.kernel.org
Reported-by: Allison Karlitskaya <lis@redhat.com>
Tested-by: Allison Karlitskaya <lis@redhat.com>
Acked-by: Christian Brauner <brauner@kernel.org>
Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/mount.h | 5 -----
fs/namespace.c | 15 ++-------------
fs/pnode.c | 4 ++--
3 files changed, 4 insertions(+), 20 deletions(-)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -7,10 +7,6 @@
extern struct list_head notify_list;
-typedef __u32 __bitwise mntns_flags_t;
-
-#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
-
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
@@ -37,7 +33,6 @@ struct mnt_namespace {
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */
- mntns_flags_t mntns_flags;
} __randomize_layout;
struct mnt_pcp {
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3648,7 +3648,7 @@ static int do_move_mount(struct path *ol
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
- if (is_anon_ns(ns)) {
+ if (is_anon_ns(ns) && ns == p->mnt_ns) {
/*
* Ending up with two files referring to the root of the
* same anonymous mount namespace would cause an error
@@ -3656,16 +3656,7 @@ static int do_move_mount(struct path *ol
* twice into the mount tree which would be rejected
* later. But be explicit about it right here.
*/
- if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
- goto out;
-
- /*
- * If this is an anonymous mount tree ensure that mount
- * propagation can detect mounts that were just
- * propagated to the target mount tree so we don't
- * propagate onto them.
- */
- ns->mntns_flags |= MNTNS_PROPAGATING;
+ goto out;
} else if (is_anon_ns(p->mnt_ns)) {
/*
* Don't allow moving an attached mount tree to an
@@ -3722,8 +3713,6 @@ static int do_move_mount(struct path *ol
if (attached)
put_mountpoint(old_mp);
out:
- if (is_anon_ns(ns))
- ns->mntns_flags &= ~MNTNS_PROPAGATING;
unlock_mount(mp);
if (!err) {
if (attached) {
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -231,8 +231,8 @@ static int propagate_one(struct mount *m
/* skip if mountpoint isn't visible in m */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
return 0;
- /* skip if m is in the anon_ns we are emptying */
- if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING)
+ /* skip if m is in the anon_ns */
+ if (is_anon_ns(m->mnt_ns))
return 0;
if (peers(m, last_dest)) {


@@ -0,0 +1,51 @@
From bc86aaf0e0256220ca787fdbb57a73429ade1129 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:52 -0600
Subject: mm/filemap: gate dropbehind invalidate on folio !dirty && !writeback
It's possible for the folio to either get marked for writeback or get
redirtied. Add a helper, filemap_end_dropbehind(), which guards the
folio_unmap_invalidate() call behind a check for the folio being both
non-dirty and not under writeback AFTER the folio lock has been
acquired. Use this helper in folio_end_dropbehind_write().
Cc: stable@vger.kernel.org
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Fixes: fb7d3bc41493 ("mm/filemap: drop streaming/uncached pages when writeback completes")
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-2-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1589,6 +1589,16 @@ int folio_wait_private_2_killable(struct
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
@@ -1604,8 +1614,7 @@ static void folio_end_dropbehind_write(s
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}


@@ -0,0 +1,51 @@
From fad76185ca91983990c660642151083eb05cbfc0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:53 -0600
Subject: mm/filemap: use filemap_end_dropbehind() for read invalidation
Use the filemap_end_dropbehind() helper rather than calling
folio_unmap_invalidate() directly, as we need to check if the folio has
been redirtied or marked for writeback once the folio lock has been
re-acquired.
Cc: stable@vger.kernel.org
Reported-by: Trond Myklebust <trondmy@hammerspace.com>
Fixes: 8026e49bff9b ("mm/filemap: add read support for RWF_DONTCACHE")
Link: https://lore.kernel.org/linux-fsdevel/ba8a9805331ce258a622feaca266b163db681a10.camel@hammerspace.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-3-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2644,8 +2644,7 @@ static inline bool pos_same_folio(loff_t
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
@@ -2653,7 +2652,7 @@ static void filemap_end_dropbehind_read(
return;
if (folio_trylock(folio)) {
if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2774,7 +2773,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);


@@ -0,0 +1,29 @@
From f0579d45f2e03fa3ba0d9466e79a31ea37acb487 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:54 -0600
Subject: Revert "Disable FOP_DONTCACHE for now due to bugs"
This reverts commit 478ad02d6844217cc7568619aeb0809d93ade43d.
Both the read and write side dirty && writeback races should be resolved
now; revert the commit that disabled FOP_DONTCACHE for filesystems.
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-4-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/fs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2186,7 +2186,7 @@ struct file_operations {
/* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
-#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */
+#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,


@@ -0,0 +1,36 @@
From 3b4614564770691cf3a6eb88127268ef6a84180c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:55 -0600
Subject: mm/filemap: unify read/write dropbehind naming
The read side is filemap_end_dropbehind_read(), while the write side
used folio_ as the prefix rather than filemap_. The read side makes more
sense; unify the naming such that the write side follows it.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-5-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1604,7 +1604,7 @@ static void filemap_end_dropbehind(struc
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
@@ -1659,7 +1659,7 @@ void folio_end_writeback(struct folio *f
acct_reclaim_writeback(folio);
if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);


@@ -0,0 +1,78 @@
From 6003153e1bc4ad4952773081d7b89aa1ab2274c3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 May 2025 07:28:56 -0600
Subject: mm/filemap: unify dropbehind flag testing and clearing
The read and write sides do this a bit differently; unify it such that
the _{read,write} helpers check the bit before locking, and the generic
handler is in charge of clearing the bit and invalidating, once under
the folio lock.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/20250527133255.452431-6-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
mm/filemap.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1595,7 +1595,11 @@ static void filemap_end_dropbehind(struc
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
folio_unmap_invalidate(mapping, folio, 0);
}
@@ -1606,6 +1610,9 @@ static void filemap_end_dropbehind(struc
*/
static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1629,8 +1636,6 @@ static void filemap_end_dropbehind_write
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1652,14 +1657,11 @@ void folio_end_writeback(struct folio *f
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- filemap_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -2651,8 +2653,7 @@ static void filemap_end_dropbehind_read(
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- filemap_end_dropbehind(folio);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}


@@ -0,0 +1,98 @@
From 61c0b2450f2b85c5053fa4f71d9c619b34d3af6c Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Mon, 26 May 2025 18:28:18 +0000
Subject: mm/khugepaged: fix race with folio split/free using temporary
reference
hpage_collapse_scan_file() calls is_refcount_suitable(), which in turn
calls folio_mapcount(). folio_mapcount() checks folio_test_large() before
proceeding to folio_large_mapcount(), but there is a race window where the
folio may get split/freed between these checks, triggering:
VM_WARN_ON_FOLIO(!folio_test_large(folio), folio)
Take a temporary reference to the folio in hpage_collapse_scan_file().
This stabilizes the folio during the refcount check and prevents
incorrect large folio detection due to a concurrent split/free. Use the
helper folio_expected_ref_count() + 1 (the extra 1 accounts for the
temporary reference taken here) to compare with folio_ref_count()
instead of using is_refcount_suitable().
Link: https://lkml.kernel.org/r/20250526182818.37978-1-shivankg@amd.com
Fixes: 05c5323b2a34 ("mm: track mapcount of large folios in single value")
Signed-off-by: Shivank Garg <shivankg@amd.com>
Reported-by: syzbot+2b99589e33edbe9475ca@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6828470d.a70a0220.38f255.000c.GAE@google.com
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Fengwei Yin <fengwei.yin@intel.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/khugepaged.c | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2295,6 +2295,17 @@ static int hpage_collapse_scan_file(stru
continue;
}
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@@ -2305,23 +2316,27 @@ static int hpage_collapse_scan_file(stru
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2333,6 +2348,7 @@ static int hpage_collapse_scan_file(stru
*/
present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);

View File

@@ -0,0 +1,198 @@
From 214092002cbd9945b7cc6314e76ec42b3f588c01 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Wed, 30 Apr 2025 10:01:51 +0000
Subject: mm: add folio_expected_ref_count() for reference count calculation
Patch series " JFS: Implement migrate_folio for jfs_metapage_aops" v5.
This patchset addresses a warning that occurs during memory compaction due
to JFS's missing migrate_folio operation. The warning was introduced by
commit 7ee3647243e5 ("migrate: Remove call to ->writepage") which added
explicit warnings when filesystems don't implement migrate_folio.
Syzbot reported the following [1]:
jfs_metapage_aops does not implement migrate_folio
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 fallback_migrate_folio mm/migrate.c:953 [inline]
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
Modules linked in:
CPU: 1 UID: 0 PID: 5861 Comm: syz-executor280 Not tainted 6.15.0-rc1-next-20250411-syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025
RIP: 0010:fallback_migrate_folio mm/migrate.c:953 [inline]
RIP: 0010:move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
To fix this issue, this series implements metapage_migrate_folio() for JFS,
which handles both single and multiple metapages per page configurations.
While most filesystems leverage existing migration implementations like
filemap_migrate_folio(), buffer_migrate_folio_norefs() or
buffer_migrate_folio() (which internally used folio_expected_refs()),
JFS's metapage architecture requires special handling of its private data
during migration. To support this, this series introduces the
folio_expected_ref_count(), which calculates external references to a
folio from page/swap cache, private data, and page table mappings.
This standardized implementation replaces the previous ad-hoc
folio_expected_refs() function and enables JFS to accurately determine
whether a folio has unexpected references before attempting migration.
Implement folio_expected_ref_count() to calculate expected folio reference
counts from:
- Page/swap cache (1 per page)
- Private data (1)
- Page table mappings (1 per map)
While originally needed for page migration operations, this improved
implementation standardizes reference counting by consolidating all
refcount contributors into a single, reusable function that can benefit
any subsystem needing to detect unexpected references to folios.
The folio_expected_ref_count() returns the sum of these external
references without including any reference the caller itself might hold.
Callers comparing against the actual folio_ref_count() must account for
their own references separately.
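For example, a caller that itself holds one reference (such as the temporary reference taken by folio_try_get() above) would check for unexpected references roughly like this (a usage sketch, not a quote from the patch):

        /* +1 accounts for the reference the caller itself is holding. */
        int expected = folio_expected_ref_count(folio) + 1;

        if (folio_ref_count(folio) != expected)
                return -EAGAIN; /* someone else holds a temporary reference */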
Link: https://syzkaller.appspot.com/bug?extid=8bb6fd945af4e0ad9299 [1]
Link: https://lkml.kernel.org/r/20250430100150.279751-1-shivankg@amd.com
Link: https://lkml.kernel.org/r/20250430100150.279751-2-shivankg@amd.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Co-developed-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Dave Kleikamp <shaggy@kernel.org>
Cc: Donet Tom <donettom@linux.ibm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
mm/migrate.c | 22 ++++---------------
2 files changed, 59 insertions(+), 18 deletions(-)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2307,6 +2307,61 @@ static inline bool folio_maybe_mapped_sh
return folio_test_large_maybe_mapped_shared(folio);
}
+/**
+ * folio_expected_ref_count - calculate the expected folio refcount
+ * @folio: the folio
+ *
+ * Calculate the expected folio refcount, taking references from the pagecache,
+ * swapcache, PG_private and page table mappings into account. Useful in
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
+ * GUP or other temporary references).
+ *
+ * Does currently not consider references from the LRU cache. If the folio
+ * was isolated from the LRU (which is the case during migration or split),
+ * the LRU cache does not apply.
+ *
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
+ * locked will return a stable result.
+ *
+ * Calling this function on a mapped folio will not result in a stable result,
+ * because nothing stops additional page table mappings from coming (e.g.,
+ * fork()) or going (e.g., munmap()).
+ *
+ * Calling this function without the folio lock will also not result in a
+ * stable result: for example, the folio might get dropped from the swapcache
+ * concurrently.
+ *
+ * However, even when called without the folio lock or on a mapped folio,
+ * this function can be used to detect unexpected references early (for example,
+ * if it makes sense to even lock the folio and unmap it).
+ *
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
+ * holding itself to the result.
+ *
+ * Returns the expected folio refcount.
+ */
+static inline int folio_expected_ref_count(const struct folio *folio)
+{
+ const int order = folio_order(folio);
+ int ref_count = 0;
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return 0;
+
+ if (folio_test_anon(folio)) {
+ /* One reference per page from the swapcache. */
+ ref_count += folio_test_swapcache(folio) << order;
+ } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
+ /* One reference per page from the pagecache. */
+ ref_count += !!folio->mapping << order;
+ /* One reference from PG_private. */
+ ref_count += folio_test_private(folio);
+ }
+
+ /* One reference per page table mapping. */
+ return ref_count + folio_mapcount(folio);
+}
+
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,20 +445,6 @@ unlock:
}
#endif
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
-
/*
* Replace the folio in the mapping.
*
@@ -601,7 +587,7 @@ static int __folio_migrate_mapping(struc
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
@@ -618,7 +604,7 @@ int migrate_huge_page_move_mapping(struc
struct folio *dst, struct folio *src)
{
XA_STATE(xas, &mapping->i_pages, folio_index(src));
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;
@@ -749,7 +735,7 @@ static int __migrate_folio(struct addres
struct folio *src, void *src_private,
enum migrate_mode mode)
{
- int rc, expected_count = folio_expected_refs(mapping, src);
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
/* Check whether src does not have extra refs before we do more work */
if (folio_ref_count(src) != expected_count)
@@ -837,7 +823,7 @@ static int __buffer_migrate_folio(struct
return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = folio_expected_refs(mapping, src);
+ expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
return -EAGAIN;

View File

@@ -0,0 +1,129 @@
From 0f52f05148589fe4115322a9cc8ffab760091a0a Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Thu, 29 May 2025 15:56:47 +0000
Subject: mm: fix uprobe pte be overwritten when expanding vma
Patch series "Fix uprobe pte be overwritten when expanding vma".
This patch (of 4):
We encountered a BUG alert triggered by Syzkaller as follows:
BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1
And we can reproduce it with the following steps:
1. register uprobe on file at zero offset
2. mmap the file at zero offset:
addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
3. mremap part of vma1 to new vma2:
addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
4. mremap back to orig addr1:
mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
In step 3, the vma1 range [addr1, addr1 + 4096] will be remapped to the new vma2
with range [addr2, addr2 + 8192], and remap uprobe anon page from the vma1
to vma2, then unmap the vma1 range [addr1, addr1 + 4096].
In step 4, the vma2 range [addr2, addr2 + 4096] will be remapped back to the
addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096,
addr1 + 8192] still maps the file, it will take vma_merge_new_range to
expand the range, and then do uprobe_mmap in vma_complete. Since the
merged vma pgoff is also zero offset, it will install the uprobe anon page
into the merged vma. However, the upcoming move_page_tables step, which uses
set_pte_at to remap the vma2 uprobe pte to the merged vma, will overwrite
the newly installed uprobe pte in the merged vma and leave that pte orphaned.
Since the uprobe pte will be remapped to the merged vma, we can remove the
unnecessary uprobe_mmap upon merged vma.
This problem was first found in linux-6.6.y and also exists in the
community syzkaller:
https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/
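A minimal userspace sketch of steps 2-4 above (assuming a uprobe has already been registered at file offset 0 by other means, e.g. via tracefs; the file path and error handling are placeholders):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        if (argc < 2)
                return 1;

        int fd = open(argv[1], O_RDONLY);       /* file with a uprobe at offset 0 */
        if (fd < 0)
                return 1;

        /* Step 2: map two pages of the file at offset 0. */
        void *addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
        if (addr1 == MAP_FAILED)
                return 1;

        /* Step 3: move the first page away, growing the mapping to two pages. */
        void *addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
        if (addr2 == MAP_FAILED)
                return 1;

        /* Step 4: move it back to the original address, triggering the merge
         * with the still-mapped tail and the uprobe_mmap() described above. */
        if (mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1) == MAP_FAILED)
                return 1;

        close(fd);
        return 0;
}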
Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com
Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 20 +++++++++++++++++---
mm/vma.h | 7 +++++++
2 files changed, 24 insertions(+), 3 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -144,6 +144,9 @@ static void init_multi_vma_prep(struct v
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -333,10 +336,13 @@ static void vma_complete(struct vma_prep
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -1783,6 +1789,14 @@ struct vm_area_struct *copy_vma(struct v
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -120,6 +122,11 @@ struct vma_merge_struct {
*/
bool give_up_on_oom :1;
+ /*
+ * If set, skip uprobe_mmap upon merged vma.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*

View File

@@ -0,0 +1,217 @@
From 6f1e03b94f7777323aaefd9286d992a1cbd0adf7 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: mm/hugetlb: unshare page tables during VMA split, not before
Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early, it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.
An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:
1. from hugetlb_split(), holding:
- mmap lock (exclusively)
- VMA lock
- file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
call us with only the mmap lock held (in shared mode), but currently
only runs while holding mmap lock (exclusively) and VMA lock
Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.
[jannh@google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 60 +++++++++++++++++++++++---------
mm/vma.c | 7 ++++
tools/testing/vma/vma_internal.h | 2 ++
4 files changed, 56 insertions(+), 16 deletions(-)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -276,6 +276,7 @@ bool is_hugetlb_entry_migration(pte_t pt
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -473,6 +474,8 @@ static inline void fixup_hugetlb_reserva
{
}
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -120,7 +120,7 @@ static void hugetlb_vma_lock_free(struct
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7884,9 +7898,16 @@ void move_hugetlb_state(struct folio *ol
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7910,8 +7931,12 @@ static void hugetlb_unshare_pmds(struct
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7921,8 +7946,10 @@ static void hugetlb_unshare_pmds(struct
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7937,7 +7964,8 @@ static void hugetlb_unshare_pmds(struct
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
/*
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -516,7 +516,14 @@ __split_vma(struct vma_iterator *vmi, st
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -793,6 +793,8 @@ static inline void vma_adjust_trans_huge
(void)next;
}
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);

View File

@@ -0,0 +1,50 @@
From cbd0e47470ea4db11acf3612edf91b5047a90d24 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:54 +0200
Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
huge_pmd_unshare() drops a reference on a page table that may have
previously been shared across processes, potentially turning it into a
normal page table used in another process in which unrelated VMAs can
afterwards be installed.
If this happens in the middle of a concurrent gup_fast(), gup_fast() could
end up walking the page tables of another process. While I don't see any
way in which that immediately leads to kernel memory corruption, it is
really weird and unexpected.
Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
just like we do in khugepaged when removing page tables for a THP
collapse.
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/hugetlb.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7628,6 +7628,13 @@ int huge_pmd_unshare(struct mm_struct *m
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;

View File

@@ -0,0 +1,48 @@
From cb42e10062f07934d60ce2a9bc154ea7ac0bab5a Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 2 Jun 2025 10:49:26 -0700
Subject: mm/madvise: handle madvise_lock() failure during race unwinding
When unwinding the race on -ERESTARTNOINTR handling of process_madvise(),
a madvise_lock() failure is ignored. Check the failure and abort the
remaining work in that case.
Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1830,7 +1830,9 @@ static ssize_t vector_madvise(struct mm_
/* Drop and reacquire lock to unwind race. */
madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
continue;
}
if (ret < 0)
@@ -1839,6 +1841,7 @@ static ssize_t vector_madvise(struct mm_
}
madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;

View File

@@ -0,0 +1,164 @@
From 0aeb6f83ff11709bb4b6fc9afa2f742681ca36e1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 28 May 2025 10:02:08 +0200
Subject: video: screen_info: Relocate framebuffers behind PCI bridges
Apply PCI host-bridge window offsets to screen_info framebuffers. Fixes
invalid access to I/O memory.
Resources behind a PCI host bridge can be relocated by a certain offset
in the kernel's CPU address range used for I/O. The framebuffer memory
range stored in screen_info refers to the CPU addresses as seen during
boot (where the offset is 0). During boot up, firmware may assign a
different memory offset to the PCI host bridge and thereby relocating
the framebuffer address of the PCI graphics device as seen by the kernel.
The information in screen_info must be updated as well.
The helper pcibios_bus_to_resource() performs the relocation of the
screen_info's framebuffer resource (given in PCI bus addresses). The
result matches the I/O-memory resource of the PCI graphics device (given
in CPU addresses). As before, we store away the information necessary to
later update the information in screen_info itself.
Commit 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated
EFI framebuffers") added the code for updating screen_info. It is based
on similar functionality that pre-existed in efifb. Efifb uses a pointer
to the PCI resource, while the newer code does a memcpy of the region.
Hence efifb sees any updates to the PCI resource and avoids the issue.
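The core of the fix amounts to translating the firmware-provided PCI bus address range into a CPU resource before matching it against the device's BARs. A sketch of that translation (lfb_base and lfb_size stand in for the values read from screen_info; error handling omitted):

        struct pci_bus_region bus_region = {
                .start = lfb_base,              /* PCI bus address from screen_info */
                .end   = lfb_base + lfb_size - 1,
        };
        struct resource r = { .flags = IORESOURCE_MEM };
        const struct resource *pr;

        /* Apply the host-bridge window offset: bus address -> CPU address. */
        pcibios_bus_to_resource(pdev->bus, &r, &bus_region);

        /* r is now in CPU addresses and can be matched against the BARs. */
        pr = pci_find_resource(pdev, &r);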
v3:
- Only use struct pci_bus_region for PCI bus addresses (Bjorn)
- Clarify address semantics in commit messages and comments (Bjorn)
v2:
- Fixed tags (Takashi, Ivan)
- Updated information on efifb
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reported-by: "Ivan T. Ivanov" <iivanov@suse.de>
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1240696
Tested-by: "Ivan T. Ivanov" <iivanov@suse.de>
Fixes: 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated EFI framebuffers")
Cc: dri-devel@lists.freedesktop.org
Cc: <stable@vger.kernel.org> # v6.9+
Link: https://lore.kernel.org/r/20250528080234.7380-1-tzimmermann@suse.de
---
drivers/video/screen_info_pci.c | 79 +++++++++++++++++++++------------
1 file changed, 50 insertions(+), 29 deletions(-)
--- a/drivers/video/screen_info_pci.c
+++ b/drivers/video/screen_info_pci.c
@@ -7,8 +7,8 @@
static struct pci_dev *screen_info_lfb_pdev;
static size_t screen_info_lfb_bar;
-static resource_size_t screen_info_lfb_offset;
-static struct resource screen_info_lfb_res = DEFINE_RES_MEM(0, 0);
+static resource_size_t screen_info_lfb_res_start; // original start of resource
+static resource_size_t screen_info_lfb_offset; // framebuffer offset within resource
static bool __screen_info_relocation_is_valid(const struct screen_info *si, struct resource *pr)
{
@@ -31,7 +31,7 @@ void screen_info_apply_fixups(void)
if (screen_info_lfb_pdev) {
struct resource *pr = &screen_info_lfb_pdev->resource[screen_info_lfb_bar];
- if (pr->start != screen_info_lfb_res.start) {
+ if (pr->start != screen_info_lfb_res_start) {
if (__screen_info_relocation_is_valid(si, pr)) {
/*
* Only update base if we have an actual
@@ -47,46 +47,67 @@ void screen_info_apply_fixups(void)
}
}
+static int __screen_info_lfb_pci_bus_region(const struct screen_info *si, unsigned int type,
+ struct pci_bus_region *r)
+{
+ u64 base, size;
+
+ base = __screen_info_lfb_base(si);
+ if (!base)
+ return -EINVAL;
+
+ size = __screen_info_lfb_size(si, type);
+ if (!size)
+ return -EINVAL;
+
+ r->start = base;
+ r->end = base + size - 1;
+
+ return 0;
+}
+
static void screen_info_fixup_lfb(struct pci_dev *pdev)
{
unsigned int type;
- struct resource res[SCREEN_INFO_MAX_RESOURCES];
- size_t i, numres;
+ struct pci_bus_region bus_region;
int ret;
+ struct resource r = {
+ .flags = IORESOURCE_MEM,
+ };
+ const struct resource *pr;
const struct screen_info *si = &screen_info;
if (screen_info_lfb_pdev)
return; // already found
type = screen_info_video_type(si);
- if (type != VIDEO_TYPE_EFI)
- return; // only applies to EFI
+ if (!__screen_info_has_lfb(type))
+ return; // only applies to EFI; maybe VESA
- ret = screen_info_resources(si, res, ARRAY_SIZE(res));
+ ret = __screen_info_lfb_pci_bus_region(si, type, &bus_region);
if (ret < 0)
return;
- numres = ret;
- for (i = 0; i < numres; ++i) {
- struct resource *r = &res[i];
- const struct resource *pr;
-
- if (!(r->flags & IORESOURCE_MEM))
- continue;
- pr = pci_find_resource(pdev, r);
- if (!pr)
- continue;
-
- /*
- * We've found a PCI device with the framebuffer
- * resource. Store away the parameters to track
- * relocation of the framebuffer aperture.
- */
- screen_info_lfb_pdev = pdev;
- screen_info_lfb_bar = pr - pdev->resource;
- screen_info_lfb_offset = r->start - pr->start;
- memcpy(&screen_info_lfb_res, r, sizeof(screen_info_lfb_res));
- }
+ /*
+ * Translate the PCI bus address to resource. Account
+ * for an offset if the framebuffer is behind a PCI host
+ * bridge.
+ */
+ pcibios_bus_to_resource(pdev->bus, &r, &bus_region);
+
+ pr = pci_find_resource(pdev, &r);
+ if (!pr)
+ return;
+
+ /*
+ * We've found a PCI device with the framebuffer
+ * resource. Store away the parameters to track
+ * relocation of the framebuffer aperture.
+ */
+ screen_info_lfb_pdev = pdev;
+ screen_info_lfb_bar = pr - pdev->resource;
+ screen_info_lfb_offset = r.start - pr->start;
+ screen_info_lfb_res_start = bus_region.start;
}
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY, 16,
screen_info_fixup_lfb);

View File

@@ -0,0 +1,86 @@
From 06ff725d11ea8713876187973c834fb595cb26f1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 3 Jun 2025 17:48:20 +0200
Subject: sysfb: Fix screen_info type check for VGA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Use the helper screen_info_video_type() to get the framebuffer
type from struct screen_info. Handle supported values in a sorted
switch statement.
Reading orig_video_isVGA is unreliable. On most systems it is a
VIDEO_TYPE_ constant. On some systems with VGA it is simply set
to 1 to signal the presence of a VGA output. See vga_probe() for
an example. Retrieving the screen_info type with the helper
screen_info_video_type() detects these cases and returns the
appropriate VIDEO_TYPE_ constant. For VGA, sysfb creates a device
named "vga-framebuffer".
The sysfb code has been taken from vga16fb, where it likely didn't
work correctly either. With this bugfix applied, vga16fb loads for
compatible vga-framebuffer devices.
Fixes: 0db5b61e0dc0 ("fbdev/vga16fb: Create EGA/VGA devices in sysfb code")
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Javier Martinez Canillas <javierm@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Tzung-Bi Shih <tzungbi@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: "Uwe Kleine-König" <u.kleine-koenig@baylibre.com>
Cc: Zsolt Kajtar <soci@c64.rulez.org>
Cc: <stable@vger.kernel.org> # v6.1+
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20250603154838.401882-1-tzimmermann@suse.de
---
drivers/firmware/sysfb.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -143,6 +143,7 @@ static __init int sysfb_init(void)
{
struct screen_info *si = &screen_info;
struct device *parent;
+ unsigned int type;
struct simplefb_platform_data mode;
const char *name;
bool compatible;
@@ -170,17 +171,26 @@ static __init int sysfb_init(void)
goto put_device;
}
+ type = screen_info_video_type(si);
+
/* if the FB is incompatible, create a legacy framebuffer device */
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
- name = "efi-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- name = "vesa-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VGAC)
- name = "vga-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_EGAC)
+ switch (type) {
+ case VIDEO_TYPE_EGAC:
name = "ega-framebuffer";
- else
+ break;
+ case VIDEO_TYPE_VGAC:
+ name = "vga-framebuffer";
+ break;
+ case VIDEO_TYPE_VLFB:
+ name = "vesa-framebuffer";
+ break;
+ case VIDEO_TYPE_EFI:
+ name = "efi-framebuffer";
+ break;
+ default:
name = "platform-framebuffer";
+ break;
+ }
pd = platform_device_alloc(name, 0);
if (!pd) {

View File

@@ -0,0 +1,113 @@
From ba4c83076943b477c90015581cc88e262a7d772f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Feb 2025 16:01:57 +0100
Subject: x86/iopl: Cure TIF_IO_BITMAP inconsistencies
io_bitmap_exit() is invoked from exit_thread() when a task exits or
when a fork fails. In the latter case exit_thread() cleans up
resources which were allocated during fork().
io_bitmap_exit() invokes task_update_io_bitmap(), which in turn ends up
in tss_update_io_bitmap(). tss_update_io_bitmap() operates on the
current task. If current has TIF_IO_BITMAP set, but no bitmap installed,
tss_update_io_bitmap() crashes with a NULL pointer dereference.
There are two issues, which lead to that problem:
1) io_bitmap_exit() should not invoke task_update_io_bitmap() when
the task being cleaned up is not the current task. That's a
clear indicator of a cleanup after a failed fork().
2) A task should not have TIF_IO_BITMAP set and neither a bitmap
installed nor IOPL emulation level 3 activated.
This happens when a kernel thread is created in the context of
a user space thread, which has TIF_IO_BITMAP set as the thread
flags are copied and the IO bitmap pointer is cleared.
Other than in the failed fork() case this has no impact because
kernel threads including IO workers never return to user space and
therefore never invoke tss_update_io_bitmap().
Cure this by adding the missing cleanups and checks:
1) Prevent io_bitmap_exit() from invoking task_update_io_bitmap() if
the task being cleaned up is not the current task.
2) Clear TIF_IO_BITMAP in copy_thread() unconditionally. For user
space forks it is set later, when the IO bitmap is inherited in
io_bitmap_share().
For paranoia's sake, add a warning to tss_update_io_bitmap() to catch
the case, when that code is invoked with inconsistent state.
Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped")
Reported-by: syzbot+e2b1803445d236442e54@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/87wmdceom2.ffs@tglx
---
arch/x86/kernel/ioport.c | 13 +++++++++----
arch/x86/kernel/process.c | 6 ++++++
2 files changed, 15 insertions(+), 4 deletions(-)
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
{
+ struct task_struct *tsk = current;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *
struct io_bitmap *iobm = tsk->thread.io_bitmap;
tsk->thread.io_bitmap = NULL;
- task_update_io_bitmap(tsk);
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, leve
}
t->iopl_emul = level;
- task_update_io_bitmap(current);
-
+ task_update_io_bitmap();
return 0;
}
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -181,6 +181,7 @@ int copy_thread(struct task_struct *p, c
frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
p->thread.iopl_warn = 0;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -469,6 +470,11 @@ void native_tss_update_io_bitmap(void)
} else {
struct io_bitmap *iobm = t->io_bitmap;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
/*
* Only copy bitmap data when the sequence number differs. The
* update time is accounted to the incoming task.

View File

@@ -0,0 +1,200 @@
From 7856e6900a09ed537366a5e0c774be8926ee022e Mon Sep 17 00:00:00 2001
From: Luo Gengkun <luogengkun@huaweicloud.com>
Date: Mon, 21 Apr 2025 03:50:21 +0000
Subject: watchdog: fix watchdog may detect false positive of softlockup
When updating `watchdog_thresh`, there is a race condition between writing
the new `watchdog_thresh` value and stopping the old watchdog timer. If
the old timer triggers during this window, it may falsely detect a
softlockup due to the old interval and the new `watchdog_thresh` value
being used. The problem can be described as follows:
# We assume the previous watchdog_thresh is 60, so the watchdog timer is
# coming every 24s.
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
|
+------>+ update watchdog_thresh (We are in kernel now)
        |
        |       # using old interval and new `watchdog_thresh`
        +------>+ watchdog hrtimer (irq context: detect softlockup)
                |
                |
                +-------+
                |
                |
                + softlockup_stop_all
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
The update to the actual `watchdog_thresh` is delayed until after the old
timer is stopped, preventing false positives.
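In pattern form, the sysctl handler only ever writes the shadow value, and the live watchdog_thresh is updated after the old timers have been stopped (a condensed sketch of the change below):

        static int __read_mostly watchdog_thresh_next; /* written by the sysctl handler */

        static void __lockup_detector_reconfigure(bool thresh_changed)
        {
                softlockup_stop_all();          /* the old timers are gone ... */
                if (thresh_changed)
                        watchdog_thresh = READ_ONCE(watchdog_thresh_next);
                set_sample_period();            /* ... before the new period is computed */
                /* ... re-enable and restart the watchdogs ... */
        }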
The following testcase may help to understand this problem.
---------------------------------------------
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
echo 60 > /proc/sys/kernel/watchdog_thresh
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
echo 10 > /proc/sys/kernel/watchdog_thresh &
---------------------------------------------
The test case above first removes the throttling restrictions for
real-time tasks. It then sets watchdog_thresh to 60 and executes a
real-time task, a simple while(1) loop, on cpu3. Consequently, the final
command gets blocked because the presence of this real-time thread
prevents kworker:3 from being selected by the scheduler. This eventually
triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating
with inconsistent variables - using both the old interval and the updated
watchdog_thresh simultaneously.
[nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com
Signed-off-by: Luo Gengkun <luogengkun@huaweicloud.com>
Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: "Nysal Jan K.A." <nysal@linux.ibm.com>
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
1 file changed, 27 insertions(+), 14 deletions(-)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,

View File

@@ -0,0 +1,288 @@
From 45c6602b7fa2a9dfd05a1f9289504c2437205ce4 Mon Sep 17 00:00:00 2001
From: Harshit Agarwal <harshit@nutanix.com>
Date: Tue, 25 Feb 2025 18:05:53 +0000
Subject: sched/rt: Fix race in push_rt_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqueue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Zs rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Zs rq, CPU Xs rq).
e) Since X is lower than Z, CPU A unlocks CPU Zs rq. Someone else has
locked CPU Xs rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
B.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU Xs lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
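Condensed, the re-validation after re-acquiring the locks becomes (a sketch of the hunk below):

        /*
         * double_lock_balance() may have dropped and re-taken rq->lock.
         * The task is only still a valid push candidate if it is still the
         * first entry on this runqueue's pushable_tasks list.
         */
        if (unlikely(is_migration_disabled(task) ||
                     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
                     task != pick_next_pushable_task(rq))) {
                double_unlock_balance(rq, lowest_rq);
                lowest_rq = NULL;
        }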
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such a
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Harshit Agarwal <harshit@nutanix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Tested-by: Will Ton <william.ton@nutanix.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
---
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 28 deletions(-)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1883,6 +1883,27 @@ static int find_lowest_rq(struct task_st
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1913,18 +1934,16 @@ static struct rq *find_lock_lowest_rq(st
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1944,27 +1963,6 @@ static struct rq *find_lock_lowest_rq(st
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task

View File

@@ -0,0 +1,62 @@
From 14b4658d3fa78b169f36e62e722a076a7c50afd8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 Jan 2025 15:39:49 +0100
Subject: sched/fair: Adhere to place_entity() constraints
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") relies on commit 4423af84b297
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
trip a WARN in place_entity().
What happens is that the lag of the very last entity is 0 per
definition -- the average of one element matches the value of that
element. Therefore place_entity() will match the condition skipping
the lag adjustment:
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
Without the 'se->vlag' condition -- it will attempt to adjust the zero
lag even though we're inserting into an empty tree.
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
don't because they didn't get updated.
Additionally, move update_load_add() after placement() as is
consistent with other place_entity() users -- this change is
non-functional; place_entity() does not use cfs_rq->load.
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de
---
kernel/sched/fair.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3808,6 +3808,7 @@ static void reweight_entity(struct cfs_r
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3834,10 +3835,11 @@ static void reweight_entity(struct cfs_r
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check

View File

@@ -0,0 +1,184 @@
From 65419a1e04de111460c4f38c47f1db39e71c3357 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 21 May 2025 09:06:02 -0700
Subject: alloc_tag: handle module codetag load errors as module load failures
Failures inside codetag_load_module() are currently ignored. As a result
an error there would not cause a module load failure and freeing of the
associated resources. Correct this behavior by propagating the error code
to the caller and handling possible errors. With this change, a failure to
allocate percpu counters, which happens at this stage, will not be ignored
and will cause a module load failure and freeing of resources. With this
change we also no longer need to disable memory allocation profiling when
this error happens; instead we fail to load the module.
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Casey Chen <cachen@purestorage.com>
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Wang <00107082@163.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/codetag.h | 8 ++++----
kernel/module/main.c | 5 +++--
lib/alloc_tag.c | 12 +++++++-----
lib/codetag.c | 34 +++++++++++++++++++++++++---------
4 files changed, 39 insertions(+), 20 deletions(-)
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,8 +36,8 @@ union codetag_ref {
struct codetag_type_desc {
const char *section;
size_t tag_size;
- void (*module_load)(struct module *mod,
- struct codetag *start, struct codetag *end);
+ int (*module_load)(struct module *mod,
+ struct codetag *start, struct codetag *end);
void (*module_unload)(struct module *mod,
struct codetag *start, struct codetag *end);
#ifdef CONFIG_MODULES
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struc
unsigned long align);
void codetag_free_module_sections(struct module *mod);
void codetag_module_replaced(struct module *mod, struct module *new_mod);
-void codetag_load_module(struct module *mod);
+int codetag_load_module(struct module *mod);
void codetag_unload_module(struct module *mod);
#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct modu
unsigned long align) { return NULL; }
static inline void codetag_free_module_sections(struct module *mod) {}
static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
-static inline void codetag_load_module(struct module *mod) {}
+static inline int codetag_load_module(struct module *mod) { return 0; }
static inline void codetag_unload_module(struct module *mod) {}
#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3399,11 +3399,12 @@ static int load_module(struct load_info
goto sysfs_cleanup;
}
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
/* Get rid of temporary copy. */
free_copy(info, flags);
- codetag_load_module(mod);
-
/* Done! */
trace_module_load(mod);
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -618,15 +618,16 @@ out:
mas_unlock(&mas);
}
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
{
/* Allocate module alloc_tag percpu counters */
struct alloc_tag *start_tag;
struct alloc_tag *stop_tag;
struct alloc_tag *tag;
+ /* percpu counters for core allocations are already statically allocated */
if (!mod)
- return;
+ return 0;
start_tag = ct_to_alloc_tag(start);
stop_tag = ct_to_alloc_tag(stop);
@@ -638,12 +639,13 @@ static void load_module(struct module *m
free_percpu(tag->counters);
tag->counters = NULL;
}
- shutdown_mem_profiling(true);
- pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
mod->name);
- break;
+ return -ENOMEM;
}
}
+
+ return 0;
}
static void replace_module(struct module *mod, struct module *new_mod)
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -167,6 +167,7 @@ static int codetag_module_init(struct co
{
struct codetag_range range;
struct codetag_module *cmod;
+ int mod_id;
int err;
range = get_section_range(mod, cttype->desc.section);
@@ -190,11 +191,20 @@ static int codetag_module_init(struct co
cmod->range = range;
down_write(&cttype->mod_lock);
- err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
- if (err >= 0) {
- cttype->count += range_size(cttype, &range);
- if (cttype->desc.module_load)
- cttype->desc.module_load(mod, range.start, range.stop);
+ mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+ if (mod_id >= 0) {
+ if (cttype->desc.module_load) {
+ err = cttype->desc.module_load(mod, range.start, range.stop);
+ if (!err)
+ cttype->count += range_size(cttype, &range);
+ else
+ idr_remove(&cttype->mod_idr, mod_id);
+ } else {
+ cttype->count += range_size(cttype, &range);
+ err = 0;
+ }
+ } else {
+ err = mod_id;
}
up_write(&cttype->mod_lock);
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct modu
mutex_unlock(&codetag_lock);
}
-void codetag_load_module(struct module *mod)
+int codetag_load_module(struct module *mod)
{
struct codetag_type *cttype;
+ int ret = 0;
if (!mod)
- return;
+ return 0;
mutex_lock(&codetag_lock);
- list_for_each_entry(cttype, &codetag_types, link)
- codetag_module_init(cttype, mod);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ ret = codetag_module_init(cttype, mod);
+ if (ret)
+ break;
+ }
mutex_unlock(&codetag_lock);
+
+ return ret;
}
void codetag_unload_module(struct module *mod)

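The propagation pattern above generalizes: a registration loop that calls per-type load hooks has to stop at the first error and report it instead of ignoring it. A standalone sketch of that shape follows; the struct, hook names and error values are invented, only the control flow mirrors codetag_load_module().

/* Standalone sketch of propagating a hook failure out of a registration
 * loop, in the spirit of codetag_load_module(). Names are illustrative. */
#include <errno.h>
#include <stdio.h>

struct tag_type {
	const char *name;
	int (*module_load)(void);   /* may be NULL; returns 0 or -errno */
};

static int ok_load(void)  { return 0; }
static int oom_load(void) { return -ENOMEM; }

static int load_all(struct tag_type *types, int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		if (!types[i].module_load)
			continue;
		ret = types[i].module_load();
		if (ret) {
			fprintf(stderr, "load of %s failed: %d\n",
				types[i].name, ret);
			break;  /* propagate instead of ignoring the error */
		}
	}
	return ret;
}

int main(void)
{
	struct tag_type types[] = {
		{ "alloc_tag",   ok_load },
		{ "pgalloc_tag", oom_load },
	};

	return load_all(types, 2) ? 1 : 0;
}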

@@ -0,0 +1,29 @@
From 3848ddd6068c425b732da6e8c78b047ed28c6114 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 27 Apr 2025 12:39:59 -0400
Subject: svcrdma: Unregister the device if svc_rdma_accept() fails
To handle device removal, svc_rdma_accept() requests removal
notification for the underlying device when accepting a connection.
However svc_rdma_free() is not invoked if svc_rdma_accept() fails.
There needs to be a matching "unregister" in that case; otherwise
the device cannot be removed.
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler")
Cc: stable@vger.kernel.org
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -575,6 +575,7 @@ static struct svc_xprt *svc_rdma_accept(
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
ib_destroy_qp(newxprt->sc_qp);
rdma_destroy_id(newxprt->sc_cm_id);
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
/* This call to put will destroy the transport */
svc_xprt_put(&newxprt->sc_xprt);
return NULL;

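The one-line fix follows the usual rule that every registration taken on the setup path needs a matching release on the failure path. A compressed, self-contained sketch of that unwinding; all helper names are hypothetical and only the ordering is the point.

/* Sketch of mirror-image cleanup on an accept-style failure path. */
#include <stdbool.h>
#include <stdio.h>

static bool register_removal_notifier(void) { puts("register notifier"); return true; }
static void unregister_removal_notifier(void) { puts("unregister notifier"); }
static bool create_queue_pair(void) { puts("create QP"); return false; /* simulated failure */ }

static int accept_connection(void)
{
	if (!register_removal_notifier())
		return -1;

	if (!create_queue_pair()) {
		/* any failure after the registration must undo it */
		unregister_removal_notifier();
		return -1;
	}
	return 0;
}

int main(void)
{
	return accept_connection() ? 1 : 0;
}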

@@ -0,0 +1,53 @@
From 38b409dd5c2fd9496fde05db4fb538a7e3593922 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 21 May 2025 16:34:13 -0400
Subject: SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls
Engineers at Hammerspace noticed that sometimes mounting with
"xprtsec=tls" hangs for a minute or so, and then times out, even
when the NFS server is reachable and responsive.
kTLS shuts off data_ready callbacks if strp->msg_ready is set to
mitigate data_ready callbacks when a full TLS record is not yet
ready to be read from the socket.
Normally msg_ready is clear when the first TLS record arrives on
a socket. However, I observed that sometimes tls_setsockopt() sets
strp->msg_ready, and that prevents forward progress because
tls_data_ready() becomes a no-op.
Moreover, Jakub says: "If there's a full record queued at the time
when [tlshd] passes the socket back to the kernel, it's up to the
reader to read the already queued data out." So SunRPC cannot
expect a data_ready call when ingress data is already waiting.
Add an explicit poll after SunRPC's upper transport is set up to
pick up any data that arrived after the TLS handshake but before
transport set-up is complete.
Reported-by: Steve Sears <sjs@hammerspace.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class")
Tested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
net/sunrpc/xprtsock.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2740,6 +2740,11 @@ static void xs_tcp_tls_setup_socket(stru
}
rpc_shutdown_client(lower_clnt);
+ /* Check for ingress data that arrived before the socket's
+ * ->data_ready callback was set up.
+ */
+ xs_poll_check_readable(upper_transport);
+
out_unlock:
current_restore_flags(pflags, PF_MEMALLOC);
upper_transport->clnt = NULL;

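The same gap exists for any event-driven reader: data that arrives before the readiness callback is armed generates no notification, so the reader has to check once explicitly. Below is a userspace analogue of that explicit check using poll() with a zero timeout; it is illustrative only and not the SunRPC code, and the fd choice is an assumption made to keep the sketch self-contained.

/* Userspace analogue of "check for ingress data that arrived before the
 * data_ready callback was set up": poll once, without waiting, after the
 * event mechanism is armed. */
#include <poll.h>
#include <stdio.h>

static void drain_ready_data(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* timeout 0: do not wait, just report what is already queued */
	if (poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLIN))
		printf("data was already queued before callbacks were armed\n");
}

int main(void)
{
	/* In a real program fd would come from socket()/connect() plus a
	 * completed TLS handshake; 0 (stdin) keeps the sketch standalone. */
	drain_ready_data(0);
	return 0;
}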

@@ -0,0 +1,89 @@
From c3e0e5bd29d97f8e5663026e8c2f25e08f1c4544 Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Thu, 29 May 2025 03:18:30 -0700
Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp()
The MANA driver's probe registers netdevice via the following call chain:
mana_probe()
register_netdev()
register_netdevice()
register_netdevice() calls notifier callback for netvsc driver,
holding the netdev mutex via netdev_lock_ops().
Further, this netvsc notifier callback ends up attempting to acquire the
same lock again in dev_xdp_propagate(), leading to a deadlock.
netvsc_netdev_event()
netvsc_vf_setxdp()
dev_xdp_propagate()
This deadlock was not observed so far because net_shaper_ops was never set,
and thus the lock was effectively a no-op in this case. Fix this by using
netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive
locking in this path.
And, since no deadlock is observed on the other path, which is via
netvsc_probe, add the lock exclusively for that path.
Also, clean up the unregistration path by removing the unnecessary call to
netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already
performs this cleanup via dev_xdp_uninstall().
Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf")
Cc: stable@vger.kernel.org
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Tested-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/hyperv/netvsc_bpf.c | 2 +-
drivers/net/hyperv/netvsc_drv.c | 4 ++--
net/core/dev.c | 1 +
3 files changed, 4 insertions(+), 3 deletions(-)
--- a/drivers/net/hyperv/netvsc_bpf.c
+++ b/drivers/net/hyperv/netvsc_bpf.c
@@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
- ret = dev_xdp_propagate(vf_netdev, &xdp);
+ ret = netif_xdp_propagate(vf_netdev, &xdp);
if (ret && prog)
bpf_prog_put(prog);
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct n
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
- netvsc_vf_setxdp(vf_netdev, NULL);
-
reinit_completion(&net_device_ctx->vf_add);
netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
@@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device
continue;
netvsc_prepare_bonding(vf_netdev);
+ netdev_lock_ops(vf_netdev);
netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ netdev_unlock_ops(vf_netdev);
__netvsc_vf_setup(net, vf_netdev);
break;
}
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9863,6 +9863,7 @@ int netif_xdp_propagate(struct net_devic
return dev->netdev_ops->ndo_bpf(dev, bpf);
}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{

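The underlying pattern is the usual pair of lock-taking and lock-assumed entry points: a path that is already called with the instance lock held (here, the notifier under netdev_lock_ops()) must use the variant that does not take the lock again. A generic sketch of that split using a pthread mutex; the function names are hypothetical.

/* Sketch of the locked/lock-held API split that avoids the recursive
 * acquisition described above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* Variant for callers that already hold dev_lock (e.g. a notifier). */
static void __propagate(int prog)
{
	printf("propagating prog %d (lock held by caller)\n", prog);
}

/* Variant for callers that do not hold the lock. */
static void propagate(int prog)
{
	pthread_mutex_lock(&dev_lock);
	__propagate(prog);
	pthread_mutex_unlock(&dev_lock);
}

static void notifier_event(int prog)
{
	/* Called with dev_lock already held: must not call propagate(),
	 * which would deadlock on the non-recursive mutex. */
	__propagate(prog);
}

int main(void)
{
	propagate(1);                /* normal path takes the lock */

	pthread_mutex_lock(&dev_lock);
	notifier_event(2);           /* notifier path: lock already held */
	pthread_mutex_unlock(&dev_lock);
	return 0;
}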

@@ -0,0 +1,113 @@
From 0f48fca427618cecf6683fa8e46cb8d0b66bb93d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 17:12:44 -0700
Subject: net: clear the dst when changing skb protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A not-so-careful NAT46 BPF program can crash the kernel
if it indiscriminately flips ingress packets from v4 to v6:
BUG: kernel NULL pointer dereference, address: 0000000000000000
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
process_backlog (net/core/dev.c:6186:4)
napi_poll (net/core/dev.c:6906:9)
net_rx_action (net/core/dev.c:7028:13)
do_softirq (kernel/softirq.c:462:3)
netif_rx (net/core/dev.c:5326:3)
dev_loopback_xmit (net/core/dev.c:4015:2)
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
NF_HOOK (./include/linux/netfilter.h:314:9)
ip_mc_output (net/ipv4/ip_output.c:400:5)
dst_output (./include/net/dst.h:459:9)
ip_local_out (net/ipv4/ip_output.c:130:9)
ip_send_skb (net/ipv4/ip_output.c:1496:8)
udp_send_skb (net/ipv4/udp.c:1040:8)
udp_sendmsg (net/ipv4/udp.c:1328:10)
The output interface has a 4->6 program attached at ingress.
We try to loop the multicast skb back to the sending socket.
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
and changes skb->protocol to v6. We enter ip6_rcv_core which
tries to use skb_dst(). But the dst is still an IPv4 one left
after IPv4 mcast output.
Clear the dst in all BPF helpers which change the protocol.
Try to preserve metadata dsts, those may carry non-routing
metadata.
Cc: stable@vger.kernel.org
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250610001245.1981782-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/core/filter.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3232,6 +3232,13 @@ static const struct bpf_func_proto bpf_s
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
/* Caller already did skb_cow() with len as headroom,
@@ -3328,7 +3335,7 @@ static int bpf_skb_proto_4_to_6(struct s
}
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -3358,7 +3365,7 @@ static int bpf_skb_proto_6_to_4(struct s
}
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -3549,10 +3556,10 @@ static int bpf_skb_net_grow(struct sk_bu
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
}
if (skb_is_gso(skb)) {
@@ -3605,10 +3612,10 @@ static int bpf_skb_net_shrink(struct sk_
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);

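As a rough userspace analogy (struct names invented, not kernel code), the invariant encoded by bpf_skb_change_protocol() is that any cached state keyed by the old protocol must be invalidated when the protocol changes.

/* Toy illustration: flipping a packet's address family must drop any
 * cached per-family routing decision. */
#include <stdio.h>
#include <stdlib.h>

struct route  { int family; };
struct packet {
	int family;               /* 4 or 6 */
	struct route *cached_rt;  /* valid only for 'family' */
};

static void set_family(struct packet *pkt, int family)
{
	pkt->family = family;
	/* the cached route was computed for the old family: drop it */
	free(pkt->cached_rt);
	pkt->cached_rt = NULL;
}

int main(void)
{
	struct packet pkt = {
		.family = 4,
		.cached_rt = malloc(sizeof(struct route)),
	};

	set_family(&pkt, 6);
	printf("cached route after family change: %p\n", (void *)pkt.cached_rt);
	return 0;
}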

@@ -0,0 +1,67 @@
From 59765af017c206b162b2ceb8d56a171e40a17719 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Jun 2025 08:35:01 +0000
Subject: net_sched: sch_sfq: reject invalid perturb period
Gerrard Tai reported that SFQ perturb_period has no range check yet,
and this can be used to trigger a race condition fixed in a separate patch.
We want to make sure ctl->perturb_period * HZ will not overflow
and is positive.
Tested:
tc qd add dev lo root sfq perturb -10 # negative value : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 1000000000 # too big : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 2000000 # acceptable value
tc -s -d qd sh dev lo
qdisc sfq 8005: root refcnt 2 limit 127p quantum 64Kb depth 127 flows 128 divisor 1024 perturb 2000000sec
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20250611083501.1810459-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/sched/sch_sfq.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -653,6 +653,14 @@ static int sfq_change(struct Qdisc *sch,
NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
return -EINVAL;
}
+
+ if (ctl->perturb_period < 0 ||
+ ctl->perturb_period > INT_MAX / HZ) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
+ return -EINVAL;
+ }
+ perturb_period = ctl->perturb_period * HZ;
+
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
return -EINVAL;
@@ -669,14 +677,12 @@ static int sfq_change(struct Qdisc *sch,
headdrop = q->headdrop;
maxdepth = q->maxdepth;
maxflows = q->maxflows;
- perturb_period = q->perturb_period;
quantum = q->quantum;
flags = q->flags;
/* update and validate configuration */
if (ctl->quantum)
quantum = ctl->quantum;
- perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {

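The added check is the standard idiom for validating a user-supplied multiplier: compare against INT_MAX / HZ before multiplying, so the product can never overflow. A standalone version of the same check follows; HZ is assumed to be 1000 purely for illustration, and the sample values match the Tested section above.

/* Standalone version of the perturb_period validation: reject negative
 * values and values whose product with HZ would overflow an int. */
#include <limits.h>
#include <stdio.h>

#define HZ 1000   /* illustrative value */

static int perturb_to_jiffies(int period_sec, int *jiffies)
{
	if (period_sec < 0 || period_sec > INT_MAX / HZ)
		return -1;                  /* invalid perturb period */
	*jiffies = period_sec * HZ;         /* now provably cannot overflow */
	return 0;
}

int main(void)
{
	int j;

	printf("-10        -> %s\n", perturb_to_jiffies(-10, &j) ? "rejected" : "ok");
	printf("1000000000 -> %s\n", perturb_to_jiffies(1000000000, &j) ? "rejected" : "ok");
	printf("2000000    -> %s\n", perturb_to_jiffies(2000000, &j) ? "rejected" : "ok");
	return 0;
}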

@@ -0,0 +1,51 @@
From b504e1cd491c55390370059280d5fbaa045d5543 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 13 Jun 2025 19:26:50 +0200
Subject: posix-cpu-timers: fix race between handle_posix_cpu_timers() and
posix_cpu_timer_del()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If an exiting non-autoreaping task has already passed exit_notify() and
calls handle_posix_cpu_timers() from IRQ, it can be reaped by its parent
or debugger right after unlock_task_sighand().
If a concurrent posix_cpu_timer_del() runs at that moment, it won't be
able to detect timer->it.cpu.firing != 0: cpu_timer_task_rcu() and/or
lock_task_sighand() will fail.
Add the tsk->exit_state check into run_posix_cpu_timers() to fix this.
This fix is not needed if CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y, because
exit_task_work() is called before exit_notify(). But the check still
makes sense, task_work_add(&tsk->posix_cputimers_work.work) will fail
anyway in this case.
Cc: stable@vger.kernel.org
Reported-by: Benoît Sevens <bsevens@google.com>
Fixes: 0bdd2ed4138e ("sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
kernel/time/posix-cpu-timers.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/


@@ -0,0 +1,93 @@
From d7b5f2aa34c56bd2a2d3cda2a7eb7aeb24df6179 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 6 Jun 2025 13:50:32 +0100
Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
While an OOM failure in commit_merge() isn't really feasible due to the
allocation which might fail (a maple tree pre-allocation) being 'too small
to fail', we do need to handle this case correctly regardless.
In vma_merge_existing_range(), we can theoretically encounter failures
which result in an OOM error in two ways - firstly dup_anon_vma() might
fail with an OOM error, and secondly commit_merge() failing, ultimately,
to pre-allocate a maple tree node.
The abort logic for dup_anon_vma() resets the VMA iterator to the initial
range, ensuring that any logic looping on this iterator will correctly
proceed to the next VMA.
However the commit_merge() abort logic does not do the same thing. This
resulted in a syzbot report occurring because mlockall() iterates through
VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
being specified due to incorrect iterator state.
While making this change, it became apparent we are duplicating logic -
the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
option on modify/merge, use in uffd release") duplicates the
vmg->give_up_on_oom check in both abort branches.
Additionally, we observe that we can perform the anon_dup check safely on
dup_anon_vma() failure, as this will not be modified should this call
fail.
Finally, we need to reset the iterator in both cases, so now we can simply
use the exact same code to abort for both.
We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
be otherwise and it allows us to implement the abort check more neatly.
Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 22 ++++------------------
1 file changed, 4 insertions(+), 18 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -927,26 +927,9 @@ static __must_check struct vm_area_struc
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- /*
- * We've cleaned up any cloned anon_vma's, no VMAs have been
- * modified, no harm no foul if the user requests that we not
- * report this and just give up, leaving the VMAs unmerged.
- */
- if (!vmg->give_up_on_oom)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -955,6 +938,9 @@ abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
/*
* This means we have failed to clone anon_vma's correctly, but no
* actual changes to VMAs have occurred, so no harm no foul - if the

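The resulting structure is the classic single-abort-label shape: both failure modes (the anon_vma duplication and the maple tree pre-allocation) funnel through one block that rewinds the iterator and unwinds the partial clone. A condensed sketch of that control flow; the helper names and error values are hypothetical.

/* Condensed sketch of funnelling both failure modes through one abort
 * path that resets iterator state. */
#include <errno.h>
#include <stdio.h>

struct iter { int pos; };

static int dup_state(void)       { return 0; }        /* may fail with -ENOMEM */
static int commit_change(void)   { return -ENOMEM; }  /* simulated failure     */
static void undo_dup_state(void) { puts("unwound duplicated state"); }

static int merge(struct iter *it, int start)
{
	int dup_done = 0;
	int err;

	it->pos = start + 1;        /* iterator moved as part of the work */

	err = dup_state();
	if (!err)
		dup_done = 1;
	if (err || commit_change())
		goto abort;
	return 0;

abort:
	it->pos = start;            /* reset so callers can keep iterating */
	if (dup_done)
		undo_dup_state();   /* only undone if it actually succeeded */
	return -1;
}

int main(void)
{
	struct iter it = { 0 };

	merge(&it, 5);
	printf("iterator restored to %d\n", it.pos);
	return 0;
}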

@@ -0,0 +1,90 @@
From db96fe27668a3bb56fa5d745d1c2eed49a95a56f Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 6 Jun 2025 10:28:07 +0100
Subject: mm: close theoretical race where stale TLB entries could linger
Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a
parallel reclaim leaving stale TLB entries") described a theoretical race
as such:
"""
Nadav Amit identified a theoretical race between page reclaim and mprotect
due to TLB flushes being batched outside of the PTL being held.
He described the race as follows:
CPU0 CPU1
---- ----
user accesses memory using RW PTE
[PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
mprotect(addr, PROT_READ)
==> change_pte_range()
==> [ PTE non-present - no flush ]
user writes using cached RW PTE
...
try_to_unmap_flush()
The same type of race exists for reads when protecting for PROT_NONE and
also exists for operations that can leave an old TLB entry behind such as
munmap, mremap and madvise.
"""
The solution was to introduce flush_tlb_batched_pending() and call it
under the PTL from mprotect/madvise/munmap/mremap to complete any pending
tlb flushes.
However, while madvise_free_pte_range() and
madvise_cold_or_pageout_pte_range() were both retro-fitted to call
flush_tlb_batched_pending() immediately after initially acquiring the PTL,
they both temporarily release the PTL to split a large folio if they
stumble upon one. In this case, after re-acquiring the PTL,
flush_tlb_batched_pending() must be called again, but previously it was
not. Let's fix that.
There are 2 Fixes: tags here: the first is the commit that fixed
madvise_free_pte_range(). The second is the commit that added
madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the
faulty pattern from madvise_free_pte_range().
This is a theoretical bug discovered during code review.
Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com
Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries")
Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -503,6 +503,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -736,6 +737,7 @@ static int madvise_free_pte_range(pmd_t
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;

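The rule the fix restores is simple: work that must be done "immediately after taking the lock" has to be repeated every time the lock is re-taken, not just on the first acquisition. A small pthread sketch of that discipline; the function names are made up.

/* Sketch: pending-flush work must be redone after every lock acquisition,
 * including re-acquisition after a temporary drop. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

static void flush_pending(void)     { puts("flushed pending batched work"); }
static void split_large_item(void)  { puts("split item without the lock"); }

static void scan_range(void)
{
	pthread_mutex_lock(&ptl);
	flush_pending();            /* required right after taking the lock */

	/* ... hit a large item that needs the lock dropped ... */
	pthread_mutex_unlock(&ptl);
	split_large_item();

	pthread_mutex_lock(&ptl);
	flush_pending();            /* required again after re-acquiring    */
	/* ... continue the scan ... */
	pthread_mutex_unlock(&ptl);
}

int main(void)
{
	scan_range();
	return 0;
}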

@@ -0,0 +1,33 @@
From f8c6b0801edd6f50057610c67120ffb42027f2c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 11:01:49 -0600
Subject: io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
If peeking a bunch of buffers, normally io_ring_buffers_peek() will
truncate the end buffer. This isn't optimal as presumably more data will
be arriving later, and hence it's better to stop with the last full
buffer rather than truncate the end buffer.
Cc: stable@vger.kernel.org
Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers")
Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/kbuf.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct i
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
- if (!(bl->flags & IOBL_INC))
+ if (!(bl->flags & IOBL_INC)) {
+ if (iov != arg->iovs)
+ break;
buf->len = len;
+ }
}
iov->iov_base = u64_to_user_ptr(buf->addr);

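The behaviour change is easiest to see with concrete numbers: when the remaining budget would only partially fill the next buffer, the loop now stops at the previous full buffer, unless that partial buffer would be the only one selected. A userspace sketch of the selection rule; this is not the io_uring code itself, and the buffer sizes are arbitrary.

/* Userspace sketch of the peek rule above: stop at the last full buffer
 * instead of truncating the final one, unless it is the only buffer. */
#include <stddef.h>
#include <stdio.h>

static size_t peek_buffers(const size_t *buf_len, size_t nbufs,
			   size_t max_len, size_t *used)
{
	size_t total = 0, n = 0;

	for (size_t i = 0; i < nbufs && total < max_len; i++) {
		size_t len = buf_len[i];

		if (total + len > max_len) {
			if (n > 0)
				break;              /* keep only full buffers */
			len = max_len - total;      /* sole buffer: truncate  */
		}
		total += len;
		n++;
	}
	*used = total;
	return n;
}

int main(void)
{
	size_t bufs[] = { 4096, 4096, 4096 };
	size_t used;
	size_t n = peek_buffers(bufs, 3, 10000, &used);

	printf("selected %zu buffers, %zu bytes\n", n, used); /* 2 buffers, 8192 */
	return 0;
}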

@@ -0,0 +1,54 @@
From a2ef8773db38d0c3a41761dbed6fc57afa440161 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 13:37:41 -0600
Subject: nvme: always punt polled uring_cmd end_io work to task_work
Currently NVMe uring_cmd completions will complete locally, if they are
polled. This is done because those completions are always invoked from
task context. And while that is true, there's no guarantee that it's
invoked under the right ring context, or even task. If someone does
NVMe passthrough via multiple threads and with a limited number of
poll queues, then ringA may find completions from ringB. For that case,
completing the request may not be sound.
Always just punt the passthrough completions via task_work, which will
redirect the completion, if needed.
Cc: stable@vger.kernel.org
Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/ioctl.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly. Note that using the uring_cmd
- * helper for this is safe only because we check blk_rq_is_poll().
- * As that returns false if we're NOT on a polled queue, then it's
- * safe to use the polled completion helper.
- *
- * Otherwise, move the completion to task work.
+ * IOPOLL could potentially complete this request directly, but
+ * if multiple rings are polling on the same queue, then it's possible
+ * for one ring to find completions for another ring. Punting the
+ * completion via task_work will always direct it to the right
+ * location, rather than potentially complete requests for ringA
+ * under iopoll invocations from ringB.
*/
- if (blk_rq_is_poll(req)) {
- if (pdu->bio)
- blk_rq_unmap_user(pdu->bio);
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
- } else {
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
- }
-
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
return RQ_END_IO_FREE;
}

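The principle is that a completion discovered by a poller is handed back to the context that owns the request rather than being completed inline, so cross-ring discoveries can never complete the wrong ring's request. A minimal single-process sketch of that hand-off; the structures are invented for illustration and carry none of the real locking.

/* Minimal sketch of punting a completion to its owning context instead of
 * completing it inline in the poller. */
#include <stdio.h>

struct request {
	int owner_ring;
	int done;
};

struct ring {
	int id;
	struct request *pending;   /* completion queued for this ring's owner */
};

/* The poller may discover a completion belonging to another ring. */
static void poll_found_completion(struct ring *poller, struct ring *owner,
				  struct request *req)
{
	(void)poller;
	owner->pending = req;      /* punt: never complete inline here */
}

/* Each ring's owner runs its own deferred work. */
static void run_task_work(struct ring *r)
{
	if (r->pending) {
		r->pending->done = 1;
		printf("ring %d completed request owned by ring %d\n",
		       r->id, r->pending->owner_ring);
		r->pending = NULL;
	}
}

int main(void)
{
	struct ring a = { .id = 0 }, b = { .id = 1 };
	struct request req = { .owner_ring = 1 };

	poll_found_completion(&a, &b, &req);  /* ring A finds ring B's request */
	run_task_work(&b);                    /* ring B completes it itself    */
	return 0;
}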

@@ -0,0 +1,33 @@
From bb51adf56b5adc7075252cd17136c2288c116602 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 11 Jun 2025 09:59:15 +0900
Subject: block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
When blk_zone_write_plug_bio_endio() is called for a regular write BIO
used to emulate a zone append operation, that is, a BIO flagged with
BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the
original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not
cleared. Clear it to fully return the BIO to its original definition.
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-zoned.c | 1 +
1 file changed, 1 insertion(+)
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struc
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
}
/*


@@ -0,0 +1,65 @@
From 56ae62470a95ac8249c43f5c0d50da2a83c350e0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 11 Jun 2025 08:48:46 -0600
Subject: block: use plug request list tail for one-shot backmerge attempt
Previously, the block layer stored the requests in the plug list in
LIFO order. For this reason, blk_attempt_plug_merge() would check
just the head entry for a back merge attempt, and abort after that
unless requests for multiple queues existed in the plug list. If more
than one request is present in the plug list, this makes the one-shot
back merging less useful than before, as it'll always fail to find a
quick merge candidate.
Use the tail entry for the one-shot merge attempt, which is the last
added request in the list. If that fails, abort immediately unless
there are multiple queues available. If multiple queues are available,
then scan the list. Ideally the latter scan would be a backwards scan
of the list, but as it currently stands, the plug list is singly linked
and hence this isn't easily feasible.
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1127,20 +1127,20 @@ bool blk_attempt_plug_merge(struct reque
if (!plug || rq_list_empty(&plug->mq_list))
return false;
- rq_list_for_each(&plug->mq_list, rq) {
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
- break;
- }
+ rq = plug->mq_list.tail;
+ if (rq->q == q)
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK;
+ else if (!plug->multiple_queues)
+ return false;
- /*
- * Only keep iterating plug list for merges if we have multiple
- * queues
- */
- if (!plug->multiple_queues)
- break;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q != q)
+ continue;
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
}
return false;
}
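The restructured logic can be summarized independently of struct request: try the most recently added (tail) entry first, and only fall back to scanning the whole list when requests for more than one queue may be plugged. A self-contained sketch of that ordering; the plug list is modelled as a plain array and all names are invented.

/* Self-contained sketch of the tail-first merge attempt above. */
#include <stdbool.h>
#include <stdio.h>

struct req { int queue; int end_sector; };

static bool try_backmerge(const struct req *rq, int queue, int start_sector)
{
	return rq->queue == queue && rq->end_sector == start_sector;
}

static bool plug_merge(const struct req *list, int n, bool multiple_queues,
		       int queue, int start_sector)
{
	if (n == 0)
		return false;

	/* one-shot attempt against the most recently added request */
	if (list[n - 1].queue == queue)
		return try_backmerge(&list[n - 1], queue, start_sector);
	if (!multiple_queues)
		return false;

	/* several queues plugged: scan for the first request on ours */
	for (int i = 0; i < n; i++) {
		if (list[i].queue != queue)
			continue;
		return try_backmerge(&list[i], queue, start_sector);
	}
	return false;
}

int main(void)
{
	struct req plug[] = { { .queue = 0, .end_sector = 100 },
			      { .queue = 0, .end_sector = 200 } };

	printf("merge at tail: %d\n", plug_merge(plug, 2, false, 0, 200)); /* 1 */
	printf("merge at head: %d\n", plug_merge(plug, 2, false, 0, 100)); /* 0 */
	return 0;
}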