
release 6.15.4

2025-06-27 14:17:53 +03:00
parent 4cb7006cd5
commit 8df072e89b
90 changed files with 569 additions and 5084 deletions


@@ -7,7 +7,7 @@ w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w"
 dst='debian/patches/tmp-pf'
 src='../linux-extras'
-branches='fixes archlinux cpuidle kbuild nfs smb xfs'
+branches='fixes archlinux cpuidle exfat kbuild nfs smb xfs'
 if [ -d "${dst}" ] ; then rm -rf "${dst}" ; fi
 mkdir -p "${dst}"
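This helper rebuilds debian/patches/tmp-pf from topic branches kept in the ../linux-extras checkout; the 6.15.4 refresh adds an exfat branch to the list. As a rough illustration of what the per-branch export step usually looks like (the loop below is an assumption for readability, not the script's actual code):

```sh
# hypothetical continuation: export each topic branch from ${src}
# into its own subdirectory of ${dst} as a patch series
# (the "master" base branch name is also an assumption)
for b in ${branches} ; do
    mkdir -p "${dst}/${b}"
    git -C "${src}" format-patch -o "${w}/${dst}/${b}" "master..${b}"
done
```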

debian/changelog vendored

@@ -1,3 +1,10 @@
+linux (6.15.4-1) sid; urgency=medium
+
+  * New upstream stable update:
+    https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.15.4
+
+ -- Konstantin Demin <rockdrilla@gmail.com>  Fri, 27 Jun 2025 14:05:47 +0300
+
 linux (6.15.3-1) sid; urgency=medium
 
   * New upstream stable update:
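When reviewing such an update, the upstream changelog referenced in the new entry can be pulled directly:

```sh
# fetch the upstream 6.15.4 changelog cited above
curl -sL https://www.kernel.org/pub/linux/kernel/v6.x/ChangeLog-6.15.4 | head -n 40
```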


@@ -36,7 +36,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 --- a/kernel/events/core.c
 +++ b/kernel/events/core.c
-@@ -450,8 +450,13 @@ static struct kmem_cache *perf_event_cac
+@@ -463,8 +463,13 @@ static struct kmem_cache *perf_event_cac
 * 0 - disallow raw tracepoint access for unpriv
 * 1 - disallow cpu events for unpriv
 * 2 - disallow kernel profiling for unpriv
@@ -50,7 +50,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 /* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
 static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
-@@ -13110,6 +13115,9 @@ SYSCALL_DEFINE5(perf_event_open,
+@@ -13144,6 +13149,9 @@ SYSCALL_DEFINE5(perf_event_open,
 if (err)
 return err;
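The hunks above only pick up new line offsets for 6.15.4; the file itself appears to be the Debian perf_event_paranoid hardening patch against kernel/events/core.c. For reference, the sysctl it documents can be inspected and changed at runtime (the meaning of each level is spelled out in the patched comment block):

```sh
sysctl kernel.perf_event_paranoid              # query the current restriction level
sudo sysctl -w kernel.perf_event_paranoid=2    # e.g. allow unprivileged user-space-only profiling
```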


@@ -68,7 +68,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 } else if (!strncmp(str, "forcedac", 8)) {
 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 iommu_dma_forcedac = true;
-@@ -1935,6 +1943,9 @@ static int device_def_domain_type(struct
+@@ -1936,6 +1944,9 @@ static int device_def_domain_type(struct
 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
 return IOMMU_DOMAIN_IDENTITY;
@@ -78,7 +78,7 @@ Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
 }
 return 0;
-@@ -2229,6 +2240,9 @@ static int __init init_dmars(void)
+@@ -2230,6 +2241,9 @@ static int __init init_dmars(void)
 iommu_set_root_entry(iommu);
 }


@@ -18,7 +18,7 @@ Signed-off-by: David Bauer <mail@david-bauer.net>
 --- a/net/mac80211/sta_info.c
 +++ b/net/mac80211/sta_info.c
-@@ -2474,6 +2474,13 @@ static void sta_stats_decode_rate(struct
+@@ -2467,6 +2467,13 @@ static void sta_stats_decode_rate(struct
 sband = local->hw.wiphy->bands[band];


@@ -28,7 +28,7 @@ Signed-off-by: Johannes Berg <johannes.berg@intel.com>
 --- a/net/mac80211/sta_info.c
 +++ b/net/mac80211/sta_info.c
-@@ -583,6 +583,7 @@ __sta_info_alloc(struct ieee80211_sub_if
+@@ -582,6 +582,7 @@ __sta_info_alloc(struct ieee80211_sub_if
 spin_lock_init(&sta->ps_lock);
 INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
 wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work);


@@ -23,7 +23,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 --- a/net/mac80211/tx.c
 +++ b/net/mac80211/tx.c
-@@ -4084,7 +4084,7 @@ struct ieee80211_txq *ieee80211_next_txq
+@@ -4077,7 +4077,7 @@ struct ieee80211_txq *ieee80211_next_txq
 if (deficit < 0)
 sta->airtime[txqi->txq.ac].deficit +=
@@ -32,7 +32,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 if (deficit < 0 || !aql_check) {
 list_move_tail(&txqi->schedule_order,
-@@ -4227,7 +4227,8 @@ bool ieee80211_txq_may_transmit(struct i
+@@ -4220,7 +4220,8 @@ bool ieee80211_txq_may_transmit(struct i
 }
 sta = container_of(iter->txq.sta, struct sta_info, sta);
 if (ieee80211_sta_deficit(sta, ac) < 0)
@@ -42,7 +42,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 list_move_tail(&iter->schedule_order, &local->active_txqs[ac]);
 }
-@@ -4235,7 +4236,7 @@ bool ieee80211_txq_may_transmit(struct i
+@@ -4228,7 +4229,7 @@ bool ieee80211_txq_may_transmit(struct i
 if (sta->airtime[ac].deficit >= 0)
 goto out;


@@ -95,7 +95,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 spin_lock_init(&local->active_txq_lock[i]);
 --- a/net/mac80211/sta_info.c
 +++ b/net/mac80211/sta_info.c
-@@ -2388,13 +2388,28 @@ EXPORT_SYMBOL(ieee80211_sta_recalc_aggre
+@@ -2381,13 +2381,28 @@ EXPORT_SYMBOL(ieee80211_sta_recalc_aggre
 void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
 struct sta_info *sta, u8 ac,
@@ -127,7 +127,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 atomic_add(tx_airtime,
 --- a/net/mac80211/tx.c
 +++ b/net/mac80211/tx.c
-@@ -2556,7 +2556,7 @@ static u16 ieee80211_store_ack_skb(struc
+@@ -2549,7 +2549,7 @@ static u16 ieee80211_store_ack_skb(struc
 spin_lock_irqsave(&local->ack_status_lock, flags);
 id = idr_alloc(&local->ack_status_frames, ack_skb,
@@ -136,7 +136,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 spin_unlock_irqrestore(&local->ack_status_lock, flags);
 if (id >= 0) {
-@@ -3985,20 +3985,20 @@ begin:
+@@ -3978,20 +3978,20 @@ begin:
 encap_out:
 info->control.vif = vif;
@@ -167,7 +167,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 }
 return skb;
-@@ -4050,6 +4050,7 @@ struct ieee80211_txq *ieee80211_next_txq
+@@ -4043,6 +4043,7 @@ struct ieee80211_txq *ieee80211_next_txq
 struct ieee80211_txq *ret = NULL;
 struct txq_info *txqi = NULL, *head = NULL;
 bool found_eligible_txq = false;
@@ -175,7 +175,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 spin_lock_bh(&local->active_txq_lock[ac]);
-@@ -4073,26 +4074,26 @@ struct ieee80211_txq *ieee80211_next_txq
+@@ -4066,26 +4067,26 @@ struct ieee80211_txq *ieee80211_next_txq
 if (!head)
 head = txqi;
@@ -214,7 +214,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 if (txqi->schedule_round == local->schedule_round[ac])
 goto out;
-@@ -4157,7 +4158,8 @@ bool ieee80211_txq_airtime_check(struct
+@@ -4150,7 +4151,8 @@ bool ieee80211_txq_airtime_check(struct
 return true;
 if (!txq->sta)
@@ -224,7 +224,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 if (unlikely(txq->tid == IEEE80211_NUM_TIDS))
 return true;
-@@ -4206,15 +4208,15 @@ bool ieee80211_txq_may_transmit(struct i
+@@ -4199,15 +4201,15 @@ bool ieee80211_txq_may_transmit(struct i
 spin_lock_bh(&local->active_txq_lock[ac]);


@@ -1,221 +0,0 @@
This reverts commit 484a54c2e597dbc4ace79c1687022282905afba0. The CoDel
parameter change essentially disables CoDel on slow stations, with some
questionable assumptions, as Dave pointed out in [0]. Quoting from
there:
But here are my pithy comments as to why this part of mac80211 is so
wrong...
static void sta_update_codel_params(struct sta_info *sta, u32 thr)
{
- if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) {
1) sta->local->num_sta is the number of associated, rather than
active, stations. "Active" stations in the last 50ms or so, might have
been a better thing to use, but as most people have far more than that
associated, we end up with really lousy codel parameters, all the
time. Mistake numero uno!
2) The STA_SLOW_THRESHOLD was completely arbitrary in 2016.
- sta->cparams.target = MS2TIME(50);
This, by itself, was probably not too bad. 30ms might have been
better, at the time, when we were battling powersave etc, but 20ms was
enough, really, to cover most scenarios, even where we had low rate
2Ghz multicast to cope with. Even then, codel has a hard time finding
any sane drop rate at all, with a target this high.
- sta->cparams.interval = MS2TIME(300);
But this was horrible, a total mistake, that is leading to codel being
completely ineffective in almost any scenario on clients or APS.
100ms, even 80ms, here, would be vastly better than this insanity. I'm
seeing 5+seconds of delay accumulated in a bunch of otherwise happily
fq-ing APs....
100ms of observed jitter during a flow is enough. Certainly (in 2016)
there were interactions with powersave that I did not understand, and
still don't, but if you are transmitting in the first place, powersave
shouldn't be a problemmmm.....
- sta->cparams.ecn = false;
At the time we were pretty nervous about ecn, I'm kind of sanguine
about it now, and reliably indicating ecn seems better than turning it
off for any reason.
[...]
In production, on p2p wireless, I've had 8ms and 80ms for target and
interval for years now, and it works great.
I think Dave's arguments above are basically sound on the face of it,
and various experimentation with tighter CoDel parameters in the OpenWrt
community have shown promising results[1]. So I don't think there's any
reason to keep this parameter fiddling; hence this revert.
[0] https://lore.kernel.org/linux-wireless/CAA93jw6NJ2cmLmMauz0xAgC2MGbBq6n0ZiZzAdkK0u4b+O2yXg@mail.gmail.com/
[1] https://forum.openwrt.org/t/reducing-multiplexing-latencies-still-further-in-wifi/133605/130
Suggested-By: Dave Taht <dave.taht@gmail.com>
In-memory-of: Dave Taht <dave.taht@gmail.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5347,22 +5347,6 @@ void ieee80211_get_tx_rates(struct ieee8
int max_rates);
/**
- * ieee80211_sta_set_expected_throughput - set the expected tpt for a station
- *
- * Call this function to notify mac80211 about a change in expected throughput
- * to a station. A driver for a device that does rate control in firmware can
- * call this function when the expected throughput estimate towards a station
- * changes. The information is used to tune the CoDel AQM applied to traffic
- * going towards that station (which can otherwise be too aggressive and cause
- * slow stations to starve).
- *
- * @pubsta: the station to set throughput for.
- * @thr: the current expected throughput in kbps.
- */
-void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
- u32 thr);
-
-/**
* ieee80211_tx_rate_update - transmit rate update callback
*
* Drivers should call this functions with a non-NULL pub sta
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -152,12 +152,6 @@ static ssize_t sta_aqm_read(struct file
p += scnprintf(p,
bufsz + buf - p,
- "target %uus interval %uus ecn %s\n",
- codel_time_to_us(sta->cparams.target),
- codel_time_to_us(sta->cparams.interval),
- sta->cparams.ecn ? "yes" : "no");
- p += scnprintf(p,
- bufsz + buf - p,
"tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -990,8 +990,6 @@ int rate_control_set_rates(struct ieee80
if (sta->uploaded)
drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta);
- ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta));
-
return 0;
}
EXPORT_SYMBOL(rate_control_set_rates);
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -18,7 +18,6 @@
#include <linux/timer.h>
#include <linux/rtnetlink.h>
-#include <net/codel.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
@@ -702,13 +701,6 @@ __sta_info_alloc(struct ieee80211_sub_if
}
}
- sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD;
- sta->cparams.target = MS2TIME(20);
- sta->cparams.interval = MS2TIME(100);
- sta->cparams.ecn = true;
- sta->cparams.ce_threshold_selector = 0;
- sta->cparams.ce_threshold_mask = 0;
-
sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
return sta;
@@ -2928,27 +2920,6 @@ unsigned long ieee80211_sta_last_active(
return sta->deflink.status_stats.last_ack;
}
-static void sta_update_codel_params(struct sta_info *sta, u32 thr)
-{
- if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) {
- sta->cparams.target = MS2TIME(50);
- sta->cparams.interval = MS2TIME(300);
- sta->cparams.ecn = false;
- } else {
- sta->cparams.target = MS2TIME(20);
- sta->cparams.interval = MS2TIME(100);
- sta->cparams.ecn = true;
- }
-}
-
-void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
- u32 thr)
-{
- struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-
- sta_update_codel_params(sta, thr);
-}
-
int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -467,14 +467,6 @@ struct ieee80211_fragment_cache {
unsigned int next;
};
-/*
- * The bandwidth threshold below which the per-station CoDel parameters will be
- * scaled to be more lenient (to prevent starvation of slow stations). This
- * value will be scaled by the number of active stations when it is being
- * applied.
- */
-#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */
-
/**
* struct link_sta_info - Link STA information
* All link specific sta info are stored here for reference. This can be
@@ -627,7 +619,6 @@ struct link_sta_info {
* @sta: station information we share with the driver
* @sta_state: duplicates information about station state (for debug)
* @rcu_head: RCU head used for freeing this station struct
- * @cparams: CoDel parameters for this station.
* @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
* @amsdu_mesh_control: track the mesh A-MSDU format used by the peer:
*
@@ -718,8 +709,6 @@ struct sta_info {
struct dentry *debugfs_dir;
#endif
- struct codel_params cparams;
-
u8 reserved_tid;
s8 amsdu_mesh_control;
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1402,16 +1402,9 @@ static struct sk_buff *fq_tin_dequeue_fu
local = container_of(fq, struct ieee80211_local, fq);
txqi = container_of(tin, struct txq_info, tin);
+ cparams = &local->cparams;
cstats = &txqi->cstats;
- if (txqi->txq.sta) {
- struct sta_info *sta = container_of(txqi->txq.sta,
- struct sta_info, sta);
- cparams = &sta->cparams;
- } else {
- cparams = &local->cparams;
- }
-
if (flow == &tin->default_flow)
cvars = &txqi->def_cvars;
else

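With sta->cparams gone after this revert, fq_tin_dequeue_func() always uses the interface-wide local->cparams, and the per-station debugfs line removed from sta_aqm_read() above no longer appears. A quick way to observe this on a live system (paths assume debugfs is mounted and a phy0/wlan0 naming):

```sh
# global fq/CoDel parameters and backlog for the radio
cat /sys/kernel/debug/ieee80211/phy0/aqm
# per-station view: the "target ... interval ... ecn ..." line is gone after the revert
cat /sys/kernel/debug/ieee80211/phy0/netdev:wlan0/stations/*/aqm
```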

@@ -1,4 +1,4 @@
-From 9c2fdcdf9d8963a6fa30005a859816639d0bbf95 Mon Sep 17 00:00:00 2001
+From b3dc27f64b5d62505ae9f03a6c342a43b0b7e0b2 Mon Sep 17 00:00:00 2001
 From: Jens Axboe <axboe@kernel.dk>
 Date: Tue, 27 May 2025 07:28:54 -0600
 Subject: Revert "Disable FOP_DONTCACHE for now due to bugs"


@@ -1,70 +0,0 @@
From 1616d0edbdf3b36a8f4694d35bcf88fa1242c7e8 Mon Sep 17 00:00:00 2001
From: Jinliang Zheng <alexjlzheng@tencent.com>
Date: Tue, 15 Apr 2025 17:02:32 +0800
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
calling writeback_set_ratelimit(), as global_dirty_limits() always
prioritizes the value of vm_dirty_bytes.
It's domain_dirty_limits() that's relevant here, not node_dirty_ok:
dirty_ratio_handler
  writeback_set_ratelimit
    global_dirty_limits(&dirty_thresh)   <- ratelimit_pages based on dirty_thresh
      domain_dirty_limits
        if (bytes)                       <- bytes = vm_dirty_bytes <--------+
          thresh = f1(bytes)             <- prioritizes vm_dirty_bytes      |
        else                                                                |
          thresh = f2(ratio)                                                |
    ratelimit_pages = f3(dirty_thresh)                                      |
  vm_dirty_bytes = 0                     <- it's late! ---------------------+
This causes ratelimit_pages to still use the value calculated based on
vm_dirty_bytes, which is wrong now.
The impact visible to userspace is difficult to capture directly because
there is no procfs/sysfs interface exported to user space. However, it
will have a real impact on the balance of dirty pages.
For example:
1. On default, we have vm_dirty_ratio=40, vm_dirty_bytes=0
2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192,
vm_dirty_ratio=0, and ratelimit_pages is calculated based on
vm_dirty_bytes now.
3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
zero when writeback_set_ratelimit() -> global_dirty_limits() ->
domain_dirty_limits() is called, ratelimit_pages is still calculated
based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
conform to the actual intent of the user.
Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com>
Reviewed-by: MengEn Sun <mengensun@tencent.com>
Cc: Andrea Righi <andrea@betterlinux.com>
Cc: Fenggaung Wu <fengguang.wu@intel.com>
Cc: Jinliang Zheng <alexjlzheng@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
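The user-visible sequence from the example above, written out as the actual procfs operations (values taken from the commit message; run as root):

```sh
cat /proc/sys/vm/dirty_ratio /proc/sys/vm/dirty_bytes   # e.g. 40 and 0
echo 8192 > /proc/sys/vm/dirty_bytes    # ratelimit_pages now derived from dirty_bytes
echo 20 > /proc/sys/vm/dirty_ratio      # before this fix, ratelimit_pages kept using the stale dirty_bytes value
```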


@@ -1,4 +1,4 @@
-From 0274339dc053815d099e9c336f11c1e9e5641792 Mon Sep 17 00:00:00 2001
+From 0b8d9b7ae677a03629218f69037be3f342c5ee81 Mon Sep 17 00:00:00 2001
 From: Jens Axboe <axboe@kernel.dk>
 Date: Tue, 27 May 2025 07:28:55 -0600
 Subject: mm/filemap: unify read/write dropbehind naming


@@ -1,179 +0,0 @@
From 87f7435508fde20e21c6b744723a3203e2045f46 Mon Sep 17 00:00:00 2001
From: GONG Ruiqi <gongruiqi1@huawei.com>
Date: Sun, 27 Apr 2025 10:53:03 +0800
Subject: vgacon: Add check for vc_origin address range in vgacon_scroll()
Our in-house Syzkaller reported the following BUG (twice), which we
believed was the same issue with [1]:
==================================================================
BUG: KASAN: slab-out-of-bounds in vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
Read of size 2 at addr ffff88800f5bef60 by task syz.7.2620/12393
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x72/0xa0 lib/dump_stack.c:106
print_address_description.constprop.0+0x6b/0x3d0 mm/kasan/report.c:364
print_report+0xba/0x280 mm/kasan/report.c:475
kasan_report+0xa9/0xe0 mm/kasan/report.c:588
vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
vcs_write_buf_noattr drivers/tty/vt/vc_screen.c:493 [inline]
vcs_write+0x586/0x840 drivers/tty/vt/vc_screen.c:690
vfs_write+0x219/0x960 fs/read_write.c:584
ksys_write+0x12e/0x260 fs/read_write.c:639
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
...
</TASK>
Allocated by task 5614:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
kasan_set_track+0x25/0x30 mm/kasan/common.c:52
____kasan_kmalloc mm/kasan/common.c:374 [inline]
__kasan_kmalloc+0x8f/0xa0 mm/kasan/common.c:383
kasan_kmalloc include/linux/kasan.h:201 [inline]
__do_kmalloc_node mm/slab_common.c:1007 [inline]
__kmalloc+0x62/0x140 mm/slab_common.c:1020
kmalloc include/linux/slab.h:604 [inline]
kzalloc include/linux/slab.h:721 [inline]
vc_do_resize+0x235/0xf40 drivers/tty/vt/vt.c:1193
vgacon_adjust_height+0x2d4/0x350 drivers/video/console/vgacon.c:1007
vgacon_font_set+0x1f7/0x240 drivers/video/console/vgacon.c:1031
con_font_set drivers/tty/vt/vt.c:4628 [inline]
con_font_op+0x4da/0xa20 drivers/tty/vt/vt.c:4675
vt_k_ioctl+0xa10/0xb30 drivers/tty/vt/vt_ioctl.c:474
vt_ioctl+0x14c/0x1870 drivers/tty/vt/vt_ioctl.c:752
tty_ioctl+0x655/0x1510 drivers/tty/tty_io.c:2779
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:871 [inline]
__se_sys_ioctl+0x12d/0x190 fs/ioctl.c:857
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
__fput_sync+0x4c/0x60 fs/file_table.c:465
__do_sys_close fs/open.c:1580 [inline]
__se_sys_close+0x68/0xd0 fs/open.c:1565
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x78/0xe2
Second to last potentially related work creation:
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
__sock_release+0xb5/0x270 net/socket.c:663
sock_close+0x1e/0x30 net/socket.c:1425
__fput+0x408/0xab0 fs/file_table.c:384
task_work_run+0x154/0x240 kernel/task_work.c:239
exit_task_work include/linux/task_work.h:45 [inline]
do_exit+0x8e5/0x1320 kernel/exit.c:874
do_group_exit+0xcd/0x280 kernel/exit.c:1023
get_signal+0x1675/0x1850 kernel/signal.c:2905
arch_do_signal_or_restart+0x80/0x3b0 arch/x86/kernel/signal.c:310
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
syscall_exit_to_user_mode+0x1b3/0x1e0 kernel/entry/common.c:218
do_syscall_64+0x66/0x110 arch/x86/entry/common.c:87
entry_SYSCALL_64_after_hwframe+0x78/0xe2
The buggy address belongs to the object at ffff88800f5be000
which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 2656 bytes to the right of
allocated 1280-byte region [ffff88800f5be000, ffff88800f5be500)
...
Memory state around the buggy address:
ffff88800f5bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88800f5bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
^
ffff88800f5bef80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800f5bf000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
By analyzing the vmcore, we found that vc->vc_origin was somehow placed
one line prior to vc->vc_screenbuf when vc was in KD_TEXT mode, and
further writings to /dev/vcs caused out-of-bounds reads (and writes
right after) in vcs_write_buf_noattr().
Our further experiments show that in most cases, vc->vc_origin equals to
vga_vram_base when the console is in KD_TEXT mode, and it's around
vc->vc_screenbuf for the KD_GRAPHICS mode. But via triggering a
TIOCL_SETVESABLANK ioctl beforehand, we can make vc->vc_origin be around
vc->vc_screenbuf while the console is in KD_TEXT mode, and then by
writing the special 'ESC M' control sequence to the tty certain times
(depends on the value of `vc->state.y - vc->vc_top`), we can eventually
move vc->vc_origin prior to vc->vc_screenbuf. Here's the PoC, tested on
QEMU:
```
int main() {
    const int RI_NUM = 10; // should be greater than `vc->state.y - vc->vc_top`
    int tty_fd, vcs_fd;
    const char *tty_path = "/dev/tty0";
    const char *vcs_path = "/dev/vcs";
    const char escape_seq[] = "\x1bM"; // ESC + M
    const char trigger_seq[] = "Let's trigger an OOB write.";
    struct vt_sizes vt_size = { 70, 2 };
    int blank = TIOCL_BLANKSCREEN;
    tty_fd = open(tty_path, O_RDWR);
    char vesa_mode[] = { TIOCL_SETVESABLANK, 1 };
    ioctl(tty_fd, TIOCLINUX, vesa_mode);
    ioctl(tty_fd, TIOCLINUX, &blank);
    ioctl(tty_fd, VT_RESIZE, &vt_size);
    for (int i = 0; i < RI_NUM; ++i)
        write(tty_fd, escape_seq, sizeof(escape_seq) - 1);
    vcs_fd = open(vcs_path, O_RDWR);
    write(vcs_fd, trigger_seq, sizeof(trigger_seq));
    close(vcs_fd);
    close(tty_fd);
    return 0;
}
```
To solve this problem, add an address range validation check in
vgacon_scroll(), ensuring vc->vc_origin never precedes vc_screenbuf.
Reported-by: syzbot+9c09fda97a1a65ea859b@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=9c09fda97a1a65ea859b [1]
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Co-developed-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: Yi Yang <yiyang13@huawei.com>
Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/console/vgacon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1168,7 +1168,7 @@ static bool vgacon_scroll(struct vc_data
c->vc_screenbuf_size - delta);
c->vc_origin = vga_vram_end - c->vc_screenbuf_size;
vga_rolled_over = 0;
- } else
+ } else if (oldo - delta >= (unsigned long)c->vc_screenbuf)
c->vc_origin -= delta;
c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size;
scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char,

View File

@@ -1,102 +0,0 @@
From 4aed4d2a911e165342a339c886101dbe3acad5e2 Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:06 +0300
Subject: fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in do_register_framebuffer() fails to allocate
memory for fb_videomode, it will later lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered while not having the
mode in modelist that is expected to be there, i.e. the one that is
described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
Even though fbcon_init() checks beforehand if fb_match_mode() in
var_to_display() fails, it can not prevent the panic because fbcon_init()
does not return error code. Considering this and the comment in the code
about fb_match_mode() returning NULL - "This should not happen" - it is
better to prevent registering the fb_info if its mode was not set
successfully. Also move fb_add_videomode() closer to the beginning of
do_register_framebuffer() to avoid having to do the cleanup on fail.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -388,7 +388,7 @@ static int fb_check_foreignness(struct f
static int do_register_framebuffer(struct fb_info *fb_info)
{
- int i;
+ int i, err = 0;
struct fb_videomode mode;
if (fb_check_foreignness(fb_info))
@@ -397,10 +397,18 @@ static int do_register_framebuffer(struc
if (num_registered_fb == FB_MAX)
return -ENXIO;
- num_registered_fb++;
for (i = 0 ; i < FB_MAX; i++)
if (!registered_fb[i])
break;
+
+ if (!fb_info->modelist.prev || !fb_info->modelist.next)
+ INIT_LIST_HEAD(&fb_info->modelist);
+
+ fb_var_to_videomode(&mode, &fb_info->var);
+ err = fb_add_videomode(&mode, &fb_info->modelist);
+ if (err < 0)
+ return err;
+
fb_info->node = i;
refcount_set(&fb_info->count, 1);
mutex_init(&fb_info->lock);
@@ -426,16 +434,12 @@ static int do_register_framebuffer(struc
if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT))
bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT);
- if (!fb_info->modelist.prev || !fb_info->modelist.next)
- INIT_LIST_HEAD(&fb_info->modelist);
-
if (fb_info->skip_vt_switch)
pm_vt_switch_required(fb_info->device, false);
else
pm_vt_switch_required(fb_info->device, true);
- fb_var_to_videomode(&mode, &fb_info->var);
- fb_add_videomode(&mode, &fb_info->modelist);
+ num_registered_fb++;
registered_fb[i] = fb_info;
#ifdef CONFIG_GUMSTIX_AM200EPD


@@ -1,4 +1,4 @@
-From de09560d2e6fbb14ea586063217277e5ebc1bc71 Mon Sep 17 00:00:00 2001
+From 2c1c3b3aafb153cbc3bd298db57cc7313d1601b1 Mon Sep 17 00:00:00 2001
 From: Jens Axboe <axboe@kernel.dk>
 Date: Tue, 27 May 2025 07:28:56 -0600
 Subject: mm/filemap: unify dropbehind flag testing and clearing


@@ -1,65 +0,0 @@
From 10c7fce24a1ad9197a8eabbba454a9a872f03d5c Mon Sep 17 00:00:00 2001
From: Murad Masimov <m.masimov@mt-integration.ru>
Date: Mon, 28 Apr 2025 18:34:07 +0300
Subject: fbdev: Fix fb_set_var to prevent null-ptr-deref in
fb_videomode_to_var
If fb_add_videomode() in fb_set_var() fails to allocate memory for
fb_videomode, later it may lead to a null-ptr dereference in
fb_videomode_to_var(), as the fb_info is registered while not having the
mode in modelist that is expected to be there, i.e. the one that is
described in fb_info->var.
================================================================
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
Call Trace:
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
resize_screen drivers/tty/vt/vt.c:1176 [inline]
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x67/0xd1
================================================================
The reason is that fb_info->var is being modified in fb_set_var(), and
then fb_videomode_to_var() is called. If it fails to add the mode to
fb_info->modelist, fb_set_var() returns error, but does not restore the
old value of fb_info->var. Restore fb_info->var on failure the same way
it is done earlier in the function.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
Signed-off-by: Helge Deller <deller@gmx.de>
---
drivers/video/fbdev/core/fbmem.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -328,8 +328,10 @@ fb_set_var(struct fb_info *info, struct
!list_empty(&info->modelist))
ret = fb_add_videomode(&mode, &info->modelist);
- if (ret)
+ if (ret) {
+ info->var = old_var;
return ret;
+ }
event.info = info;
event.data = &mode;


@@ -1,4 +1,4 @@
-From c041325f222c774573ad73d35939451a4e221e52 Mon Sep 17 00:00:00 2001
+From 61d27e9dadb2eb2b7596a11a37402452d97625f7 Mon Sep 17 00:00:00 2001
 From: Shivank Garg <shivankg@amd.com>
 Date: Mon, 26 May 2025 18:28:18 +0000
 Subject: mm/khugepaged: fix race with folio split/free using temporary


@@ -1,113 +0,0 @@
From 13ccad7713b89e7693feb5346e7893dc8edce7a8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:15 +0200
Subject: anon_inode: use a proper mode internally
This allows the VFS to not trip over anonymous inodes and we can add
asserts based on the mode into the vfs. When we report it to userspace
we can simply hide the mode to avoid regressions. I've audited all
direct callers of alloc_anon_inode() and only secretmem overrides i_mode
and i_op inode operations but it already uses a regular file.
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-1-53a44c20d44e@kernel.org
Fixes: af153bb63a336 ("vfs: catch invalid modes in may_open()")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Reported-by: syzbot+5d8e79d323a13aa0b248@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67ed3fb3.050a0220.14623d.0009.GAE@google.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 36 ++++++++++++++++++++++++++++++++++++
fs/internal.h | 3 +++
fs/libfs.c | 8 +++++++-
3 files changed, 46 insertions(+), 1 deletion(-)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,44 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -66,6 +100,7 @@ static struct inode *anon_inode_make_sec
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@@ -313,6 +348,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -343,3 +343,6 @@ static inline bool path_mounted(const st
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1647,7 +1647,13 @@ struct inode *alloc_anon_inode(struct su
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ /*
+ * Historically anonymous inodes didn't have a type at all and
+ * userspace has come to rely on this. Internally they're just
+ * regular files but S_IFREG is masked off when reporting
+ * information to userspace.
+ */
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
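As a quick userspace-visible illustration of the behaviour preserved above, anonymous inodes are easy to spot via procfs (PID 1 is just a convenient process that always holds epoll/timerfd descriptors):

```sh
# anonymous fds are reported as anon_inode:[...] links rather than paths,
# and their reported st_mode still carries no file-type bits after this patch
sudo ls -l /proc/1/fd | grep anon_inode
```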


@@ -1,4 +1,4 @@
-From 76653593bdf5fda03717991681b5d60e2af015e9 Mon Sep 17 00:00:00 2001
+From 8135974e9e512fdf6d15f59947f95e44f2834c37 Mon Sep 17 00:00:00 2001
 From: Shivank Garg <shivankg@amd.com>
 Date: Wed, 30 Apr 2025 10:01:51 +0000
 Subject: mm: add folio_expected_ref_count() for reference count calculation


@@ -1,80 +0,0 @@
From 5a3eea2c3e9675a8b713eef0d52b7c437f1f613b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:17 +0200
Subject: anon_inode: explicitly block ->setattr()
It is currently possible to change the mode and owner of the single
anonymous inode in the kernel:
int main(int argc, char *argv[])
{
    int ret, sfd;
    sigset_t mask;
    struct signalfd_siginfo fdsi;
    sigemptyset(&mask);
    sigaddset(&mask, SIGINT);
    sigaddset(&mask, SIGQUIT);
    ret = sigprocmask(SIG_BLOCK, &mask, NULL);
    if (ret < 0)
        _exit(1);
    sfd = signalfd(-1, &mask, 0);
    if (sfd < 0)
        _exit(2);
    ret = fchown(sfd, 5555, 5555);
    if (ret < 0)
        _exit(3);
    ret = fchmod(sfd, 0777);
    if (ret < 0)
        _exit(3);
    _exit(4);
}
This is a bug. It's not really a meaningful one because anonymous inodes
don't really figure into path lookup and they cannot be reopened via
/proc/<pid>/fd/<nr> and can't be used for lookup itself. So they can
only ever serve as direct references.
But it is still completely bogus to allow the mode and ownership or any
of the properties of the anonymous inode to be changed. Block this!
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-3-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 7 +++++++
fs/internal.h | 2 ++
2 files changed, 9 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -57,8 +57,15 @@ int anon_inode_getattr(struct mnt_idmap
return 0;
}
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
static const struct inode_operations anon_inode_operations = {
.getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
};
/*
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -346,3 +346,5 @@ int statmount_mnt_idmap(struct mnt_idmap
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);


@@ -1,4 +1,4 @@
-From 96e19aa45a528ce5c722f1925d750f74efe22a8b Mon Sep 17 00:00:00 2001
+From 3d1a493525955678c231ab7ccf0950c0ba2b9f45 Mon Sep 17 00:00:00 2001
 From: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
 Date: Fri, 13 Jun 2025 11:42:46 +0530
 Subject: drm/i915/snps_hdmi_pll: Fix 64-bit divisor truncation by using


@@ -1,39 +0,0 @@
From 8c9775d285f9755477a8b1f8b215102dce014ed2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 7 Apr 2025 11:54:19 +0200
Subject: anon_inode: raise SB_I_NODEV and SB_I_NOEXEC
It isn't possible to execute anonymous inodes because they cannot be
opened in any way after they have been created. This includes execution:
execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH)
Anonymous inodes have inode->f_op set to no_open_fops which sets
no_open() which returns ENXIO. That means any call to do_dentry_open()
which is the endpoint of the do_open_execat() will fail. There's no
chance to execute an anonymous inode. Unless a given subsystem overrides
it ofc.
However, we should still harden this and raise SB_I_NODEV and
SB_I_NOEXEC on the superblock itself so that no one gets any creative
ideas.
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-5-53a44c20d44e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org # all LTS kernels
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/anon_inodes.c | 2 ++
1 file changed, 2 insertions(+)
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -86,6 +86,8 @@ static int anon_inodefs_init_fs_context(
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}


@@ -0,0 +1,190 @@
From 3a317593ed60909e02e059a43b2ef588f95fd457 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 10 Jun 2025 01:17:51 +0800
Subject: mm/shmem, swap: fix softlockup with mTHP swapin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Following softlockup can be easily reproduced on my test machine with:
echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
swapon /dev/zram0 # zram0 is a 48G swap device
mkdir -p /sys/fs/cgroup/memory/test
echo 1G > /sys/fs/cgroup/test/memory.max
echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs
while true; do
dd if=/dev/zero of=/tmp/test.img bs=1M count=5120
cat /tmp/test.img > /dev/null
rm /tmp/test.img
done
Then after a while:
watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787]
Modules linked in: zram virtiofs
CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)·
Tainted: [L]=SOFTLOCKUP
Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015
RIP: 0010:mpol_shared_policy_lookup+0xd/0x70
Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8
RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202
RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001
RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518
RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001
R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000
FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
shmem_alloc_folio+0x31/0xc0
shmem_swapin_folio+0x309/0xcf0
? filemap_get_entry+0x117/0x1e0
? xas_load+0xd/0xb0
? filemap_get_entry+0x101/0x1e0
shmem_get_folio_gfp+0x2ed/0x5b0
shmem_file_read_iter+0x7f/0x2e0
vfs_read+0x252/0x330
ksys_read+0x68/0xf0
do_syscall_64+0x4c/0x1c0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f03f9a46991
Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec
RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991
RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003
RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380
R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000
R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000
</TASK>
The reason is simple, readahead brought some order 0 folio in swap cache,
and the swapin mTHP folio being allocated is in conflict with it, so
swapcache_prepare fails and causes shmem_swap_alloc_folio to return
-EEXIST, and shmem simply retries again and again causing this loop.
Fix it by applying a similar fix for anon mTHP swapin.
The performance change is very slight, time of swapin 10g zero folios
with shmem (test for 12 times):
Before: 2.47s
After: 2.48s
[kasong@tencent.com: add comment]
Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com
Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device")
Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/memory.c | 20 --------------------
mm/shmem.c | 6 +++++-
mm/swap.h | 23 +++++++++++++++++++++++
3 files changed, 28 insertions(+), 21 deletions(-)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4225,26 +4225,6 @@ static struct folio *__alloc_swap_folio(
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2262,6 +2262,7 @@ static int shmem_swapin_folio(struct ino
folio = swap_cache_get_folio(swap, NULL, 0);
order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
+ int nr_pages = 1 << order;
bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
@@ -2275,9 +2276,12 @@ static int shmem_swapin_folio(struct ino
* If uffd is active for the vma, we need per-page fault
* fidelity to maintain the uffd semantics, then fallback
* to swapin order-0 folio, as well as for zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
*/
if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
- !zswap_never_enabled()))
+ !zswap_never_enabled() ||
+ non_swapcache_batch(swap, nr_pages) != nr_pages))
fallback_order0 = true;
/* Skip swapcache for synchronous device. */
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp
return find_next_bit(sis->zeromap, end, start) - start;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+ * While allocating a large folio and doing mTHP swapin, we need to
+ * ensure all entries are not cached, otherwise, the mTHP folio will
+ * be in conflict with the folio in swap cache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp
return 0;
}
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ return 0;
+}
#endif /* CONFIG_SWAP */
#endif /* _MM_SWAP_H */


@@ -1,136 +0,0 @@
From d90681a50098e204f2e111b9433f6fc73a939854 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 21 Apr 2025 10:27:40 +0200
Subject: fs: add S_ANON_INODE
This makes it easy to detect proper anonymous inodes and to ensure that
we can detect them in codepaths such as readahead().
Readahead on anonymous inodes didn't work because they didn't have a
proper mode. Now that they have we need to retain EINVAL being returned
otherwise LTP will fail.
We also need to ensure that ioctls aren't simply fired like they are for
regular files so things like inotify inodes continue to correctly call
their own ioctl handlers as in [1].
Reported-by: Xilin Wu <sophon@radxa.com>
Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1]
Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/ioctl.c | 7 ++++---
fs/libfs.c | 2 +-
fs/pidfs.c | 2 +-
include/linux/fs.h | 2 ++
mm/readahead.c | 20 ++++++++++++++++----
5 files changed, 24 insertions(+), 9 deletions(-)
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *fil
return ioctl_get_fs_sysfs_path(filp, argp);
default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1656,7 +1656,7 @@ struct inode *alloc_anon_inode(struct su
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -826,7 +826,7 @@ static int pidfs_init_inode(struct inode
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2344,6 +2344,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struc
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t of
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);


@@ -0,0 +1,100 @@
From 4b247e559e4046bbbfab468e66f9d3197eaf12ec Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 11 Jun 2025 15:13:14 +0200
Subject: mm/gup: revert "mm: gup: fix infinite loop within
__get_longterm_locked"
After commit 1aaf8c122918 ("mm: gup: fix infinite loop within
__get_longterm_locked") we are able to longterm pin folios that are not
supposed to get longterm pinned, simply because they temporarily have the
LRU flag cleared (esp. temporarily isolated).
For example, two __get_longterm_locked() callers can race, or
__get_longterm_locked() can race with anything else that temporarily
isolates folios.
The introducing commit mentions the use case of a driver that uses
vm_ops->fault to insert pages allocated through cma_alloc() into the page
tables, assuming they can later get longterm pinned. These pages/ folios
would never have the LRU flag set and consequently cannot get isolated.
There is no known in-tree user making use of that so far, fortunately.
To handle that in the future -- and avoid retrying forever to
isolate/migrate them -- we will need a different mechanism for the CMA
area *owner* to indicate that it actually already allocated the page and
is fine with longterm pinning it. The LRU flag is not suitable for that.
Probably we can lookup the relevant CMA area and query the bitmap; we only
have to care about some races, probably. If already allocated, we
could just allow longterm pinning)
Anyhow, let's fix the "must not be longterm pinned" problem first by
reverting the original commit.
Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com
Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked")
Signed-off-by: David Hildenbrand <david@redhat.com>
Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/
Reported-by: Hyesoo Yu <hyesoo.yu@samsung.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Cc: Aijun Sun <aijun.sun@unisoc.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/gup.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2320,13 +2320,13 @@ static void pofs_unpin(struct pages_or_f
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- unsigned long i;
for (i = 0; i < pofs->nr_entries; i++) {
struct folio *folio = pofs_get_folio(pofs, i);
@@ -2338,6 +2338,8 @@ static void collect_longterm_unpinnable_
if (folio_is_longterm_pinnable(folio))
continue;
+ collected++;
+
if (folio_is_device_coherent(folio))
continue;
@@ -2359,6 +2361,8 @@ static void collect_longterm_unpinnable_
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
+
+ return collected;
}
/*
@@ -2435,9 +2439,11 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
+ unsigned long collected;
- collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
- if (list_empty(&movable_folio_list))
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
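The returned count, not the emptiness of the list, has to drive the decision: device-coherent folios and folios that temporarily fail isolation are counted as collected but never appear on movable_folio_list, so a list_empty() check would wrongly report "nothing to do" for them. A condensed sketch of the restored caller flow (simplified from the mm/gup.c code above, not additional patch code):

	collected = collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
	if (!collected)
		return 0;	/* every folio is already longterm-pinnable */
	/* unpins everything and migrates what was isolated; the caller retries */
	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);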

View File

@@ -1,35 +0,0 @@
From c161e0ffb55a12b9b26819fa0ecf8217ab781e97 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Wed, 7 May 2025 19:50:26 +0800
Subject: configfs: Do not override creating attribute file failure in
populate_attrs()
populate_attrs() may override a failure to create attribute files
with the success of creating subsequent bin attribute files, and thus
return a wrong value.
Fix this by creating bin attribute files only after the attribute files
have been created successfully.
Fixes: 03607ace807b ("configfs: implement binary attributes")
Cc: stable@vger.kernel.org
Reviewed-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20250507-fix_configfs-v3-2-fe2d96de8dc4@quicinc.com
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
---
fs/configfs/dir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -619,7 +619,7 @@ static int populate_attrs(struct config_
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
continue;

View File

@@ -0,0 +1,191 @@
From 7ebf89b788aa5b83897e99ad6e3dd6e0cb0f5030 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Wed, 4 Jun 2025 23:10:38 +0800
Subject: mm: userfaultfd: fix race of userfaultfd_move and swap cache
This commit fixes two kinds of races; they may have different results:
Barry reported a BUG_ON in commit c50f8e6053b0; we may see the same
BUG_ON if the filemap lookup returns NULL and the folio is added to the
swap cache after that.
If another kind of race is triggered (folio changed after lookup), we
may see that the RSS counter is corrupted:
[ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
type:MM_ANONPAGES val:-1
[ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0
type:MM_SHMEMPAGES val:1
Because the folio is being accounted to the wrong VMA.
I'm not sure if there will be any data corruption, though it seems not.
The issues above are critical already.
On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache
lookup, and tries to move the found folio to the faulting vma. Currently,
it relies on checking the PTE value to ensure that the moved folio still
belongs to the src swap entry and that no new folio has been added to the
swap cache, which turns out to be unreliable.
While working on and reviewing the swap table series with Barry, the
following existing races were observed and reproduced [1]:
In the example below, move_pages_pte is moving src_pte to dst_pte, where
src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the
swap cache:
CPU1                               CPU2
userfaultfd_move
  move_pages_pte()
    entry = pte_to_swp_entry(orig_src_pte);
    // Here it got entry = S1
    ... < interrupted> ...
                                   <swapin src_pte, alloc and use folio A>
                                   // folio A is a new allocated folio
                                   // and get installed into src_pte
                                   <frees swap entry S1>
                                   // src_pte now points to folio A, S1
                                   // has swap count == 0, it can be freed
                                   // by folio_swap_swap or swap
                                   // allocator's reclaim.
                                   <try to swap out another folio B>
                                   // folio B is a folio in another VMA.
                                   <put folio B to swap cache using S1 >
                                   // S1 is freed, folio B can use it
                                   // for swap out with no problem.
                                   ...
    folio = filemap_get_folio(S1)
    // Got folio B here !!!
    ... < interrupted again> ...
                                   <swapin folio B and free S1>
                                   // Now S1 is free to be used again.
                                   <swapout src_pte & folio A using S1>
                                   // Now src_pte is a swap entry PTE
                                   // holding S1 again.
    folio_trylock(folio)
    move_swap_pte
      double_pt_lock
      is_pte_pages_stable
      // Check passed because src_pte == S1
      folio_move_anon_rmap(...)
      // Moved invalid folio B here !!!
The race window is very short and requires multiple collisions of multiple
rare events, so it's very unlikely to happen, but with a deliberately
constructed reproducer and increased time window, it can be reproduced
easily.
This can be fixed by checking if the folio returned by filemap is the
valid swap cache folio after acquiring the folio lock.
Another similar race is possible: filemap_get_folio may return NULL, but
folio (A) could be swapped in and then swapped out again using the same
swap entry after the lookup. In such a case, folio (A) may remain in the
swap cache, so it must be moved too:
CPU1                               CPU2
userfaultfd_move
  move_pages_pte()
    entry = pte_to_swp_entry(orig_src_pte);
    // Here it got entry = S1, and S1 is not in swap cache
    folio = filemap_get_folio(S1)
    // Got NULL
    ... < interrupted again> ...
                                   <swapin folio A and free S1>
                                   <swapout folio A re-using S1>
    move_swap_pte
      double_pt_lock
      is_pte_pages_stable
      // Check passed because src_pte == S1
      folio_move_anon_rmap(...)
      // folio A is ignored !!!
Fix this by checking the swap cache again after acquiring the src_pte
lock. And to avoid the filemap overhead, we check swap_map directly [2].
The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far
we don't need to worry about that, since folios can only be exposed to the
swap cache in the swap out path, and this is covered in this patch by
checking the swap cache again after acquiring the src_pte lock.
Testing with a simple C program that allocates and moves several GB of
memory did not show any observable performance change.
Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com
Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
Signed-off-by: Kairui Song <kasong@tencent.com>
Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoSt=9+JwP7Q@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8eT=A@mail.gmail.com/ [2]
Reviewed-by: Lokesh Gidra <lokeshgidra@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Chris Li <chrisl@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++--
1 file changed, 31 insertions(+), 2 deletions(-)
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struc
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struc
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache, the only possible case is
+ * when a folio is swapped in and out again staying in swap cache, using the
+ * same entry before the PTE check above. The PTL is acquired and released
+ * twice, each time after updating the swap_map's flag. So holding
+ * the PTL here ensures we see the updated value. False positive is possible,
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+ * cache, or during the tiny synchronization window between swap cache and
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1412,7 +1441,7 @@ retry:
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
- dst_ptl, src_ptl, src_folio);
+ dst_ptl, src_ptl, src_folio, si, entry);
}
out:
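Both new bail-outs surface as -EAGAIN, which UFFDIO_MOVE callers already have to treat as a transient condition and retry. A hedged userspace sketch of such a retry loop (assuming a userfaultfd file descriptor whose destination range is already registered, and UAPI headers that provide UFFDIO_MOVE, i.e. Linux 6.8 or later):

#include <errno.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Move len bytes of anonymous memory from src to dst, retrying on EAGAIN. */
static int move_range(int uffd, unsigned long dst, unsigned long src, size_t len)
{
	struct uffdio_move mv = {
		.dst = dst,
		.src = src,
		.len = len,
		.mode = 0,
	};

	for (;;) {
		mv.move = 0;
		if (ioctl(uffd, UFFDIO_MOVE, &mv) == 0)
			return 0;
		if (errno != EAGAIN)
			return -errno;
		/* Transient failure (e.g. the races fixed above). Account for
		 * any partial progress reported in mv.move, then retry. */
		if (mv.move > 0) {
			mv.dst += mv.move;
			mv.src += mv.move;
			mv.len -= mv.move;
			if (!mv.len)
				return 0;
		}
	}
}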

View File

@@ -0,0 +1,26 @@
From 222985dcb732fae554af5276f44c30d648a1d05b Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 10 Jun 2025 20:53:30 +0200
Subject: dm-raid: fix variable in journal device check
Replace "rdev" with correct loop variable name "r".
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 63c32ed4afc2 ("dm raid: add raid4/5/6 journaling support")
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
drivers/md/dm-raid.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2410,7 +2410,7 @@ static int super_init_validation(struct
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (test_bit(Journal, &rdev->flags) ||
+ if (test_bit(Journal, &r->flags) ||
!r->sb_page)
continue;
sb2 = page_address(r->sb_page);

View File

@@ -1,129 +0,0 @@
From 1e9a258def978a9388a50ae43c85557b0598a7d3 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Thu, 29 May 2025 15:56:47 +0000
Subject: mm: fix uprobe pte be overwritten when expanding vma
Patch series "Fix uprobe pte be overwritten when expanding vma".
This patch (of 4):
We encountered a BUG alert triggered by Syzkaller as follows:
BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1
And we can reproduce it with the following steps:
1. register uprobe on file at zero offset
2. mmap the file at zero offset:
addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
3. mremap part of vma1 to new vma2:
addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
4. mremap back to orig addr1:
mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
In step 3, the vma1 range [addr1, addr1 + 4096] will be remap to new vma2
with range [addr2, addr2 + 8192], and remap uprobe anon page from the vma1
to vma2, then unmap the vma1 range [addr1, addr1 + 4096].
In step 4, the vma2 range [addr2, addr2 + 4096] will be remap back to the
addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096,
addr1 + 8192] still maps the file, it will take vma_merge_new_range to
expand the range, and then do uprobe_mmap in vma_complete. Since the
merged vma pgoff is also zero offset, it will install uprobe anon page to
the merged vma. However, the upcoming move_page_tables step, which uses
set_pte_at to remap the vma2 uprobe pte to the merged vma, will overwrite
the newly installed uprobe pte in the merged vma, leaving that pte orphaned.
Since the uprobe pte will be remapped to the merged vma, we can remove the
unnecessary uprobe_mmap upon the merged vma.
This problem was first found in linux-6.6.y and also exists in the
community syzkaller:
https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/
Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com
Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 20 +++++++++++++++++---
mm/vma.h | 7 +++++++
2 files changed, 24 insertions(+), 3 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -144,6 +144,9 @@ static void init_multi_vma_prep(struct v
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -333,10 +336,13 @@ static void vma_complete(struct vma_prep
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -1783,6 +1789,14 @@ struct vm_area_struct *copy_vma(struct v
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -19,6 +19,8 @@ struct vma_prepare {
struct vm_area_struct *insert;
struct vm_area_struct *remove;
struct vm_area_struct *remove2;
+
+ bool skip_vma_uprobe :1;
};
struct unlink_vma_file_batch {
@@ -120,6 +122,11 @@ struct vma_merge_struct {
*/
bool give_up_on_oom :1;
+ /*
+ * If set, skip uprobe_mmap upon merged vma.
+ */
+ bool skip_vma_uprobe :1;
+
/* Internal flags set during merge process: */
/*
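The four reproduction steps above map onto a small userspace sequence; a hedged sketch (step 1, registering a uprobe at offset 0 of the file via tracefs, is assumed to have been done beforehand, and 4096 is taken as the page size, as in the steps above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* argv[1]: file that already has a uprobe registered at offset 0 */
	int fd;
	void *addr1, *addr2;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	/* step 2: mmap the file at zero offset */
	addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
	if (addr1 == MAP_FAILED)
		return 1;

	/* step 3: mremap part of vma1 to new vma2 */
	addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
	if (addr2 == MAP_FAILED)
		return 1;

	/* step 4: mremap back to orig addr1 */
	if (mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1) == MAP_FAILED)
		return 1;

	puts("done; on unpatched kernels, exit may report a bad rss-counter in dmesg");
	return 0;
}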

View File

@@ -1,217 +0,0 @@
From 2d8c79ec421253aab9560a47a7e73d678c84585c Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:53 +0200
Subject: mm/hugetlb: unshare page tables during VMA split, not before
Currently, __split_vma() triggers hugetlb page table unsharing through
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
taken - which is too early; it allows racing VMA-locked page faults in our
process and racing rmap walks from other processes to cause page tables to
be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from
__split_vma() in the same place where THP splitting also happens. At that
point, both the VMA and the rmap(s) are write-locked.
An annoying detail is that we can now call into the helper
hugetlb_unshare_pmds() from two different locking contexts:
1. from hugetlb_split(), holding:
- mmap lock (exclusively)
- VMA lock
- file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
call us with only the mmap lock held (in shared mode), but currently
only runs while holding mmap lock (exclusively) and VMA lock
Backporting note:
This commit fixes a racy protection that was introduced in commit
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
commit claimed to fix an issue introduced in 5.13, but it should actually
also go all the way back.
[jannh@google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/hugetlb.h | 3 ++
mm/hugetlb.c | 60 +++++++++++++++++++++++---------
mm/vma.c | 7 ++++
tools/testing/vma/vma_internal.h | 2 ++
4 files changed, 56 insertions(+), 16 deletions(-)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -276,6 +276,7 @@ bool is_hugetlb_entry_migration(pte_t pt
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -473,6 +474,8 @@ static inline void fixup_hugetlb_reserva
{
}
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -120,7 +120,7 @@ static void hugetlb_vma_lock_free(struct
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+ unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
+ return 0;
+}
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+ * This function is called in the middle of a VMA split operation, with
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
+ * walks (except hardware and gup_fast()).
*/
+ vma_assert_write_locked(vma);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
if (addr & ~PUD_MASK) {
- /*
- * hugetlb_vm_op_split is called right before we attempt to
- * split the VMA. We will need to unshare PMDs in the old and
- * new VMAs, so let's unshare before we split.
- */
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
- hugetlb_unshare_pmds(vma, floor, ceil);
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+ /*
+ * Locking:
+ * Use take_locks=false here.
+ * The file rmap lock is already held.
+ * The hugetlb VMA lock can't be taken when we already
+ * hold the file rmap lock, and we don't need it because
+ * its purpose is to synchronize against concurrent page
+ * table walks, which are not possible thanks to the
+ * locks held by our caller.
+ */
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+ }
}
-
- return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7884,9 +7898,16 @@ void move_hugetlb_state(struct folio *ol
spin_unlock_irq(&hugetlb_lock);
}
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -7910,8 +7931,12 @@ static void hugetlb_unshare_pmds(struct
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
- hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (take_locks) {
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ }
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7921,8 +7946,10 @@ static void hugetlb_unshare_pmds(struct
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
- hugetlb_vma_unlock_write(vma);
+ if (take_locks) {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7937,7 +7964,8 @@ static void hugetlb_unshare_pmds(struct
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+ /* take_locks = */ true);
}
/*
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -516,7 +516,14 @@ __split_vma(struct vma_iterator *vmi, st
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -793,6 +793,8 @@ static inline void vma_adjust_trans_huge
(void)next;
}
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
static inline void vma_iter_free(struct vma_iterator *vmi)
{
mas_destroy(&vmi->mas);

View File

@@ -1,50 +0,0 @@
From e1280358284feaf844db5c6a76078b2c1738c5ae Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 27 May 2025 23:23:54 +0200
Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
huge_pmd_unshare() drops a reference on a page table that may have
previously been shared across processes, potentially turning it into a
normal page table used in another process in which unrelated VMAs can
afterwards be installed.
If this happens in the middle of a concurrent gup_fast(), gup_fast() could
end up walking the page tables of another process. While I don't see any
way in which that immediately leads to kernel memory corruption, it is
really weird and unexpected.
Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
just like we do in khugepaged when removing page tables for a THP
collapse.
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/hugetlb.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7628,6 +7628,13 @@ int huge_pmd_unshare(struct mm_struct *m
return 0;
pud_clear(pud);
+ /*
+ * Once our caller drops the rmap lock, some other process might be
+ * using this page table as a normal, non-hugetlb page table.
+ * Wait for pending gup_fast() in other threads to finish before letting
+ * that happen.
+ */
+ tlb_remove_table_sync_one();
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
mm_dec_nr_pmds(mm);
return 1;

View File

@@ -1,48 +0,0 @@
From b36611870ea72c82eb78d90a017658394bdb9690 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 2 Jun 2025 10:49:26 -0700
Subject: mm/madvise: handle madvise_lock() failure during race unwinding
When unwinding race on -ERESTARTNOINTR handling of process_madvise(),
madvise_lock() failure is ignored. Check the failure and abort the remaining
work in that case.
Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1830,7 +1830,9 @@ static ssize_t vector_madvise(struct mm_
/* Drop and reacquire lock to unwind race. */
madvise_unlock(mm, behavior);
- madvise_lock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
continue;
}
if (ret < 0)
@@ -1839,6 +1841,7 @@ static ssize_t vector_madvise(struct mm_
}
madvise_unlock(mm, behavior);
+out:
ret = (total_len - iov_iter_count(iter)) ? : ret;
return ret;

View File

@@ -1,164 +0,0 @@
From f0ab226d0eae3aa7e26524efc040026a65ead640 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 28 May 2025 10:02:08 +0200
Subject: video: screen_info: Relocate framebuffers behind PCI bridges
Apply PCI host-bridge window offsets to screen_info framebuffers. Fixes
invalid access to I/O memory.
Resources behind a PCI host bridge can be relocated by a certain offset
in the kernel's CPU address range used for I/O. The framebuffer memory
range stored in screen_info refers to the CPU addresses as seen during
boot (where the offset is 0). During boot up, firmware may assign a
different memory offset to the PCI host bridge and thereby relocating
the framebuffer address of the PCI graphics device as seen by the kernel.
The information in screen_info must be updated as well.
The helper pcibios_bus_to_resource() performs the relocation of the
screen_info's framebuffer resource (given in PCI bus addresses). The
result matches the I/O-memory resource of the PCI graphics device (given
in CPU addresses). As before, we store away the information necessary to
later update the information in screen_info itself.
Commit 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated
EFI framebuffers") added the code for updating screen_info. It is based
on similar functionality that pre-existed in efifb. Efifb uses a pointer
to the PCI resource, while the newer code does a memcpy of the region.
Hence efifb sees any updates to the PCI resource and avoids the issue.
v3:
- Only use struct pci_bus_region for PCI bus addresses (Bjorn)
- Clarify address semantics in commit messages and comments (Bjorn)
v2:
- Fixed tags (Takashi, Ivan)
- Updated information on efifb
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reported-by: "Ivan T. Ivanov" <iivanov@suse.de>
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1240696
Tested-by: "Ivan T. Ivanov" <iivanov@suse.de>
Fixes: 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated EFI framebuffers")
Cc: dri-devel@lists.freedesktop.org
Cc: <stable@vger.kernel.org> # v6.9+
Link: https://lore.kernel.org/r/20250528080234.7380-1-tzimmermann@suse.de
---
drivers/video/screen_info_pci.c | 79 +++++++++++++++++++++------------
1 file changed, 50 insertions(+), 29 deletions(-)
--- a/drivers/video/screen_info_pci.c
+++ b/drivers/video/screen_info_pci.c
@@ -7,8 +7,8 @@
static struct pci_dev *screen_info_lfb_pdev;
static size_t screen_info_lfb_bar;
-static resource_size_t screen_info_lfb_offset;
-static struct resource screen_info_lfb_res = DEFINE_RES_MEM(0, 0);
+static resource_size_t screen_info_lfb_res_start; // original start of resource
+static resource_size_t screen_info_lfb_offset; // framebuffer offset within resource
static bool __screen_info_relocation_is_valid(const struct screen_info *si, struct resource *pr)
{
@@ -31,7 +31,7 @@ void screen_info_apply_fixups(void)
if (screen_info_lfb_pdev) {
struct resource *pr = &screen_info_lfb_pdev->resource[screen_info_lfb_bar];
- if (pr->start != screen_info_lfb_res.start) {
+ if (pr->start != screen_info_lfb_res_start) {
if (__screen_info_relocation_is_valid(si, pr)) {
/*
* Only update base if we have an actual
@@ -47,46 +47,67 @@ void screen_info_apply_fixups(void)
}
}
+static int __screen_info_lfb_pci_bus_region(const struct screen_info *si, unsigned int type,
+ struct pci_bus_region *r)
+{
+ u64 base, size;
+
+ base = __screen_info_lfb_base(si);
+ if (!base)
+ return -EINVAL;
+
+ size = __screen_info_lfb_size(si, type);
+ if (!size)
+ return -EINVAL;
+
+ r->start = base;
+ r->end = base + size - 1;
+
+ return 0;
+}
+
static void screen_info_fixup_lfb(struct pci_dev *pdev)
{
unsigned int type;
- struct resource res[SCREEN_INFO_MAX_RESOURCES];
- size_t i, numres;
+ struct pci_bus_region bus_region;
int ret;
+ struct resource r = {
+ .flags = IORESOURCE_MEM,
+ };
+ const struct resource *pr;
const struct screen_info *si = &screen_info;
if (screen_info_lfb_pdev)
return; // already found
type = screen_info_video_type(si);
- if (type != VIDEO_TYPE_EFI)
- return; // only applies to EFI
+ if (!__screen_info_has_lfb(type))
+ return; // only applies to EFI; maybe VESA
- ret = screen_info_resources(si, res, ARRAY_SIZE(res));
+ ret = __screen_info_lfb_pci_bus_region(si, type, &bus_region);
if (ret < 0)
return;
- numres = ret;
- for (i = 0; i < numres; ++i) {
- struct resource *r = &res[i];
- const struct resource *pr;
-
- if (!(r->flags & IORESOURCE_MEM))
- continue;
- pr = pci_find_resource(pdev, r);
- if (!pr)
- continue;
-
- /*
- * We've found a PCI device with the framebuffer
- * resource. Store away the parameters to track
- * relocation of the framebuffer aperture.
- */
- screen_info_lfb_pdev = pdev;
- screen_info_lfb_bar = pr - pdev->resource;
- screen_info_lfb_offset = r->start - pr->start;
- memcpy(&screen_info_lfb_res, r, sizeof(screen_info_lfb_res));
- }
+ /*
+ * Translate the PCI bus address to resource. Account
+ * for an offset if the framebuffer is behind a PCI host
+ * bridge.
+ */
+ pcibios_bus_to_resource(pdev->bus, &r, &bus_region);
+
+ pr = pci_find_resource(pdev, &r);
+ if (!pr)
+ return;
+
+ /*
+ * We've found a PCI device with the framebuffer
+ * resource. Store away the parameters to track
+ * relocation of the framebuffer aperture.
+ */
+ screen_info_lfb_pdev = pdev;
+ screen_info_lfb_bar = pr - pdev->resource;
+ screen_info_lfb_offset = r.start - pr->start;
+ screen_info_lfb_res_start = bus_region.start;
}
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY, 16,
screen_info_fixup_lfb);

View File

@@ -1,86 +0,0 @@
From 717bcb42b8cd4119c88249fbfc26d08e25a2ca24 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 3 Jun 2025 17:48:20 +0200
Subject: sysfb: Fix screen_info type check for VGA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Use the helper screen_info_video_type() to get the framebuffer
type from struct screen_info. Handle supported values in sorted
switch statement.
Reading orig_video_isVGA is unreliable. On most systems it is a
VIDEO_TYPE_ constant. On some systems with VGA it is simply set
to 1 to signal the presence of a VGA output. See vga_probe() for
an example. Retrieving the screen_info type with the helper
screen_info_video_type() detects these cases and returns the
appropriate VIDEO_TYPE_ constant. For VGA, sysfb creates a device
named "vga-framebuffer".
The sysfb code has been taken from vga16fb, where it likely didn't
work correctly either. With this bugfix applied, vga16fb loads for
compatible vga-framebuffer devices.
Fixes: 0db5b61e0dc0 ("fbdev/vga16fb: Create EGA/VGA devices in sysfb code")
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Javier Martinez Canillas <javierm@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Tzung-Bi Shih <tzungbi@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: "Uwe Kleine-König" <u.kleine-koenig@baylibre.com>
Cc: Zsolt Kajtar <soci@c64.rulez.org>
Cc: <stable@vger.kernel.org> # v6.1+
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20250603154838.401882-1-tzimmermann@suse.de
---
drivers/firmware/sysfb.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -143,6 +143,7 @@ static __init int sysfb_init(void)
{
struct screen_info *si = &screen_info;
struct device *parent;
+ unsigned int type;
struct simplefb_platform_data mode;
const char *name;
bool compatible;
@@ -170,17 +171,26 @@ static __init int sysfb_init(void)
goto put_device;
}
+ type = screen_info_video_type(si);
+
/* if the FB is incompatible, create a legacy framebuffer device */
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
- name = "efi-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- name = "vesa-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VGAC)
- name = "vga-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_EGAC)
+ switch (type) {
+ case VIDEO_TYPE_EGAC:
name = "ega-framebuffer";
- else
+ break;
+ case VIDEO_TYPE_VGAC:
+ name = "vga-framebuffer";
+ break;
+ case VIDEO_TYPE_VLFB:
+ name = "vesa-framebuffer";
+ break;
+ case VIDEO_TYPE_EFI:
+ name = "efi-framebuffer";
+ break;
+ default:
name = "platform-framebuffer";
+ break;
+ }
pd = platform_device_alloc(name, 0);
if (!pd) {

View File

@@ -1,200 +0,0 @@
From 08b1e02fc44abc04d813dbc827812db9ebca0dad Mon Sep 17 00:00:00 2001
From: Luo Gengkun <luogengkun@huaweicloud.com>
Date: Mon, 21 Apr 2025 03:50:21 +0000
Subject: watchdog: fix watchdog may detect false positive of softlockup
When updating `watchdog_thresh`, there is a race condition between writing
the new `watchdog_thresh` value and stopping the old watchdog timer. If
the old timer triggers during this window, it may falsely detect a
softlockup due to the old interval and the new `watchdog_thresh` value
being used. The problem can be described as follow:
# We assume the previous watchdog_thresh is 60, so the watchdog timer is
# coming every 24s.
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
        |
        +------>+ update watchdog_thresh (We are in kernel now)
                |
                |   # using old interval and new `watchdog_thresh`
                +------>+ watchdog hrtimer (irq context: detect softlockup)
                        |
                        |
                        +-------+
                                |
                                |
                                + softlockup_stop_all
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
The update to the actual `watchdog_thresh` is delayed until after the old
timer is stopped, preventing false positives.
The following testcase may help to understand this problem.
---------------------------------------------
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
echo 60 > /proc/sys/kernel/watchdog_thresh
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
echo 10 > /proc/sys/kernel/watchdog_thresh &
---------------------------------------------
The test case above first removes the throttling restrictions for
real-time tasks. It then sets watchdog_thresh to 60 and executes a
real-time task, a simple while(1) loop, on cpu3. Consequently, the final
command gets blocked because the presence of this real-time thread
prevents kworker:3 from being selected by the scheduler. This eventually
triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating
with inconsistent variables - using both the old interval and the updated
watchdog_thresh simultaneously.
[nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com
Signed-off-by: Luo Gengkun <luogengkun@huaweicloud.com>
Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: "Nysal Jan K.A." <nysal@linux.ibm.com>
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
1 file changed, 27 insertions(+), 14 deletions(-)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
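The core of the fix is an ordering rule: the value used by the timer callback is only published after every timer armed with the old interval has been stopped. A minimal standalone sketch of that shadow-variable pattern (stop_timers() and start_timers() are hypothetical stand-ins, not kernel APIs):

/* the configuration path writes only the shadow value */
static int thresh      = 60;	/* read by the running timer callback */
static int thresh_next = 60;	/* written by the configuration path */

static void stop_timers(void)  { /* hypothetical: cancel timers using thresh */ }
static void start_timers(void) { /* hypothetical: re-arm timers from thresh */ }

static void reconfigure(int thresh_changed)
{
	stop_timers();			/* nothing can fire with the old interval now */
	if (thresh_changed)
		thresh = thresh_next;	/* publish only after the old timers are gone */
	start_timers();			/* new interval and new threshold together */
}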

View File

@@ -1,288 +0,0 @@
From ff8503c4997332bb5708c3b77f8a19f334e947a9 Mon Sep 17 00:00:00 2001
From: Harshit Agarwal <harshit@nutanix.com>
Date: Tue, 25 Feb 2025 18:05:53 +0000
Subject: sched/rt: Fix race in push_rt_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqeue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from the scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Zs rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Zs rq, CPU Xs rq).
e) Since X is lower than Z, CPU A unlocks CPU Zs rq. Someone else has
locked CPU Xs rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
B.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU Xs lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such a
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon@nutanix.com>
Signed-off-by: Jon Kohler <jon@nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
Signed-off-by: Harshit Agarwal <harshit@nutanix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Tested-by: Will Ton <william.ton@nutanix.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
---
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 28 deletions(-)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1883,6 +1883,27 @@ static int find_lowest_rq(struct task_st
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1913,18 +1934,16 @@ static struct rq *find_lock_lowest_rq(st
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1944,27 +1963,6 @@ static struct rq *find_lock_lowest_rq(st
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task

View File

@@ -1,62 +0,0 @@
From e02cbdc12bf63da363d7e3391376819241d67fbe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 Jan 2025 15:39:49 +0100
Subject: sched/fair: Adhere to place_entity() constraints
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") relies on commit 4423af84b297
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
trip a WARN in place_entity().
What happens is that the lag of the very last entity is 0 per
definition -- the average of one element matches the value of that
element. Therefore place_entity() will match the condition skipping
the lag adjustment:
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
Without the 'se->vlag' condition -- it will attempt to adjust the zero
lag even though we're inserting into an empty tree.
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
don't because they didn't get updated.
Additionally, move update_load_add() after placement() as is
consistent with other place_entity() users -- this change is
non-functional, place_entity() does not use cfs_rq->load.
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de
---
kernel/sched/fair.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3808,6 +3808,7 @@ static void reweight_entity(struct cfs_r
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3834,10 +3835,11 @@ static void reweight_entity(struct cfs_r
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check

View File

@@ -1,184 +0,0 @@
From 7257e4f8df6b5783978ab06063fc8529ee2631d5 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 21 May 2025 09:06:02 -0700
Subject: alloc_tag: handle module codetag load errors as module load failures
Failures inside codetag_load_module() are currently ignored. As a result
an error there would not cause a module load failure and freeing of the
associated resources. Correct this behavior by propagating the error code
to the caller and handling possible errors. With this change, error to
allocate percpu counters, which happens at this stage, will not be ignored
and will cause a module load failure and freeing of resources. With this
change we also do not need to disable memory allocation profiling when
this error happens, instead we fail to load the module.
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Casey Chen <cachen@purestorage.com>
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Wang <00107082@163.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberalin <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/codetag.h | 8 ++++----
kernel/module/main.c | 5 +++--
lib/alloc_tag.c | 12 +++++++-----
lib/codetag.c | 34 +++++++++++++++++++++++++---------
4 files changed, 39 insertions(+), 20 deletions(-)
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,8 +36,8 @@ union codetag_ref {
struct codetag_type_desc {
const char *section;
size_t tag_size;
- void (*module_load)(struct module *mod,
- struct codetag *start, struct codetag *end);
+ int (*module_load)(struct module *mod,
+ struct codetag *start, struct codetag *end);
void (*module_unload)(struct module *mod,
struct codetag *start, struct codetag *end);
#ifdef CONFIG_MODULES
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struc
unsigned long align);
void codetag_free_module_sections(struct module *mod);
void codetag_module_replaced(struct module *mod, struct module *new_mod);
-void codetag_load_module(struct module *mod);
+int codetag_load_module(struct module *mod);
void codetag_unload_module(struct module *mod);
#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct modu
unsigned long align) { return NULL; }
static inline void codetag_free_module_sections(struct module *mod) {}
static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
-static inline void codetag_load_module(struct module *mod) {}
+static inline int codetag_load_module(struct module *mod) { return 0; }
static inline void codetag_unload_module(struct module *mod) {}
#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3399,11 +3399,12 @@ static int load_module(struct load_info
goto sysfs_cleanup;
}
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
/* Get rid of temporary copy. */
free_copy(info, flags);
- codetag_load_module(mod);
-
/* Done! */
trace_module_load(mod);
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -618,15 +618,16 @@ out:
mas_unlock(&mas);
}
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
{
/* Allocate module alloc_tag percpu counters */
struct alloc_tag *start_tag;
struct alloc_tag *stop_tag;
struct alloc_tag *tag;
+ /* percpu counters for core allocations are already statically allocated */
if (!mod)
- return;
+ return 0;
start_tag = ct_to_alloc_tag(start);
stop_tag = ct_to_alloc_tag(stop);
@@ -638,12 +639,13 @@ static void load_module(struct module *m
free_percpu(tag->counters);
tag->counters = NULL;
}
- shutdown_mem_profiling(true);
- pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
mod->name);
- break;
+ return -ENOMEM;
}
}
+
+ return 0;
}
static void replace_module(struct module *mod, struct module *new_mod)
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -167,6 +167,7 @@ static int codetag_module_init(struct co
{
struct codetag_range range;
struct codetag_module *cmod;
+ int mod_id;
int err;
range = get_section_range(mod, cttype->desc.section);
@@ -190,11 +191,20 @@ static int codetag_module_init(struct co
cmod->range = range;
down_write(&cttype->mod_lock);
- err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
- if (err >= 0) {
- cttype->count += range_size(cttype, &range);
- if (cttype->desc.module_load)
- cttype->desc.module_load(mod, range.start, range.stop);
+ mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+ if (mod_id >= 0) {
+ if (cttype->desc.module_load) {
+ err = cttype->desc.module_load(mod, range.start, range.stop);
+ if (!err)
+ cttype->count += range_size(cttype, &range);
+ else
+ idr_remove(&cttype->mod_idr, mod_id);
+ } else {
+ cttype->count += range_size(cttype, &range);
+ err = 0;
+ }
+ } else {
+ err = mod_id;
}
up_write(&cttype->mod_lock);
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct modu
mutex_unlock(&codetag_lock);
}
-void codetag_load_module(struct module *mod)
+int codetag_load_module(struct module *mod)
{
struct codetag_type *cttype;
+ int ret = 0;
if (!mod)
- return;
+ return 0;
mutex_lock(&codetag_lock);
- list_for_each_entry(cttype, &codetag_types, link)
- codetag_module_init(cttype, mod);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ ret = codetag_module_init(cttype, mod);
+ if (ret)
+ break;
+ }
mutex_unlock(&codetag_lock);
+
+ return ret;
}
void codetag_unload_module(struct module *mod)
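As a side note, the rollback shape that load_module() now follows (allocate per-entry resources, undo the partial work and report the error to the caller) is the usual one; a minimal userspace sketch, with hypothetical names and plain C rather than the kernel code, looks like this:

/* Illustrative only: partial-failure rollback analogous to the per-tag
 * percpu counter allocation above. All names here are hypothetical. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct tag { void *counters; };

static int load_tags(struct tag *tags, int n)
{
	for (int i = 0; i < n; i++) {
		tags[i].counters = calloc(1, 64);
		if (!tags[i].counters) {
			while (i--) {           /* undo what was already allocated */
				free(tags[i].counters);
				tags[i].counters = NULL;
			}
			return -ENOMEM;         /* propagate instead of ignoring */
		}
	}
	return 0;
}

int main(void)
{
	struct tag tags[4] = { 0 };
	int ret = load_tags(tags, 4);

	printf("load_tags: %d\n", ret);
	for (int i = 0; i < 4; i++)
		free(tags[i].counters);
	return ret ? 1 : 0;
}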

View File

@@ -1,29 +0,0 @@
From 57fdc30dcdad60e3b868682cc1e77083c091aef5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 27 Apr 2025 12:39:59 -0400
Subject: svcrdma: Unregister the device if svc_rdma_accept() fails
To handle device removal, svc_rdma_accept() requests removal
notification for the underlying device when accepting a connection.
However svc_rdma_free() is not invoked if svc_rdma_accept() fails.
There needs to be a matching "unregister" in that case; otherwise
the device cannot be removed.
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler")
Cc: stable@vger.kernel.org
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -577,6 +577,7 @@ static struct svc_xprt *svc_rdma_accept(
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
ib_destroy_qp(newxprt->sc_qp);
rdma_destroy_id(newxprt->sc_cm_id);
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
/* This call to put will destroy the transport */
svc_xprt_put(&newxprt->sc_xprt);
return NULL;

View File

@@ -1,53 +0,0 @@
From 92e99ba55ff0ce68ea7567331beda21861da2028 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 21 May 2025 16:34:13 -0400
Subject: SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls
Engineers at Hammerspace noticed that sometimes mounting with
"xprtsec=tls" hangs for a minute or so, and then times out, even
when the NFS server is reachable and responsive.
kTLS shuts off data_ready callbacks if strp->msg_ready is set to
mitigate data_ready callbacks when a full TLS record is not yet
ready to be read from the socket.
Normally msg_ready is clear when the first TLS record arrives on
a socket. However, I observed that sometimes tls_setsockopt() sets
strp->msg_ready, and that prevents forward progress because
tls_data_ready() becomes a no-op.
Moreover, Jakub says: "If there's a full record queued at the time
when [tlshd] passes the socket back to the kernel, it's up to the
reader to read the already queued data out." So SunRPC cannot
expect a data_ready call when ingress data is already waiting.
Add an explicit poll after SunRPC's upper transport is set up to
pick up any data that arrived after the TLS handshake but before
transport set-up is complete.
Reported-by: Steve Sears <sjs@hammerspace.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class")
Tested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
net/sunrpc/xprtsock.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2740,6 +2740,11 @@ static void xs_tcp_tls_setup_socket(stru
}
rpc_shutdown_client(lower_clnt);
+ /* Check for ingress data that arrived before the socket's
+ * ->data_ready callback was set up.
+ */
+ xs_poll_check_readable(upper_transport);
+
out_unlock:
current_restore_flags(pflags, PF_MEMALLOC);
upper_transport->clnt = NULL;

View File

@@ -1,89 +0,0 @@
From ac0c5ac5efecec7f731a1d80ec40ef3d34adc5ee Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Thu, 29 May 2025 03:18:30 -0700
Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp()
The MANA driver's probe registers netdevice via the following call chain:
mana_probe()
register_netdev()
register_netdevice()
register_netdevice() calls notifier callback for netvsc driver,
holding the netdev mutex via netdev_lock_ops().
Further, this netvsc notifier callback ends up attempting to acquire the
same lock again in dev_xdp_propagate(), leading to a deadlock.
netvsc_netdev_event()
netvsc_vf_setxdp()
dev_xdp_propagate()
This deadlock was not observed so far because net_shaper_ops was never set,
and thus the lock was effectively a no-op in this case. Fix this by using
netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive
locking in this path.
And, since no deadlock is observed on the other path, which is via
netvsc_probe, add the lock exclusively for that path.
Also, clean up the unregistration path by removing the unnecessary call to
netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already
performs this cleanup via dev_xdp_uninstall().
Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf")
Cc: stable@vger.kernel.org
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Tested-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/hyperv/netvsc_bpf.c | 2 +-
drivers/net/hyperv/netvsc_drv.c | 4 ++--
net/core/dev.c | 1 +
3 files changed, 4 insertions(+), 3 deletions(-)
--- a/drivers/net/hyperv/netvsc_bpf.c
+++ b/drivers/net/hyperv/netvsc_bpf.c
@@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
- ret = dev_xdp_propagate(vf_netdev, &xdp);
+ ret = netif_xdp_propagate(vf_netdev, &xdp);
if (ret && prog)
bpf_prog_put(prog);
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct n
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
- netvsc_vf_setxdp(vf_netdev, NULL);
-
reinit_completion(&net_device_ctx->vf_add);
netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
@@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device
continue;
netvsc_prepare_bonding(vf_netdev);
+ netdev_lock_ops(vf_netdev);
netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
+ netdev_unlock_ops(vf_netdev);
__netvsc_vf_setup(net, vf_netdev);
break;
}
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9863,6 +9863,7 @@ int netif_xdp_propagate(struct net_devic
return dev->netdev_ops->ndo_bpf(dev, bpf);
}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{

View File

@@ -1,113 +0,0 @@
From 485c82a86fb97fb86cac303348c85b6cf71fd787 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 17:12:44 -0700
Subject: net: clear the dst when changing skb protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A not-so-careful NAT46 BPF program can crash the kernel
if it indiscriminately flips ingress packets from v4 to v6:
BUG: kernel NULL pointer dereference, address: 0000000000000000
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
process_backlog (net/core/dev.c:6186:4)
napi_poll (net/core/dev.c:6906:9)
net_rx_action (net/core/dev.c:7028:13)
do_softirq (kernel/softirq.c:462:3)
netif_rx (net/core/dev.c:5326:3)
dev_loopback_xmit (net/core/dev.c:4015:2)
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
NF_HOOK (./include/linux/netfilter.h:314:9)
ip_mc_output (net/ipv4/ip_output.c:400:5)
dst_output (./include/net/dst.h:459:9)
ip_local_out (net/ipv4/ip_output.c:130:9)
ip_send_skb (net/ipv4/ip_output.c:1496:8)
udp_send_skb (net/ipv4/udp.c:1040:8)
udp_sendmsg (net/ipv4/udp.c:1328:10)
The output interface has a 4->6 program attached at ingress.
We try to loop the multicast skb back to the sending socket.
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
and changes skb->protocol to v6. We enter ip6_rcv_core which
tries to use skb_dst(). But the dst is still an IPv4 one left
after IPv4 mcast output.
Clear the dst in all BPF helpers which change the protocol.
Try to preserve metadata dsts, those may carry non-routing
metadata.
Cc: stable@vger.kernel.org
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250610001245.1981782-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/core/filter.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3233,6 +3233,13 @@ static const struct bpf_func_proto bpf_s
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
/* Caller already did skb_cow() with len as headroom,
@@ -3329,7 +3336,7 @@ static int bpf_skb_proto_4_to_6(struct s
}
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -3359,7 +3366,7 @@ static int bpf_skb_proto_6_to_4(struct s
}
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -3550,10 +3557,10 @@ static int bpf_skb_net_grow(struct sk_bu
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
}
if (skb_is_gso(skb)) {
@@ -3606,10 +3613,10 @@ static int bpf_skb_net_shrink(struct sk_
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);

View File

@@ -1,67 +0,0 @@
From 2bf1f4a3adcecc53c1012e460d1412cece3747ce Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Jun 2025 08:35:01 +0000
Subject: net_sched: sch_sfq: reject invalid perturb period
Gerrard Tai reported that SFQ perturb_period has no range check yet,
and this can be used to trigger a race condition fixed in a separate patch.
We want to make sure ctl->perturb_period * HZ will not overflow
and is positive.
Tested:
tc qd add dev lo root sfq perturb -10 # negative value : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 1000000000 # too big : error
Error: sch_sfq: invalid perturb period.
tc qd add dev lo root sfq perturb 2000000 # acceptable value
tc -s -d qd sh dev lo
qdisc sfq 8005: root refcnt 2 limit 127p quantum 64Kb depth 127 flows 128 divisor 1024 perturb 2000000sec
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20250611083501.1810459-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/sched/sch_sfq.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -656,6 +656,14 @@ static int sfq_change(struct Qdisc *sch,
NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
return -EINVAL;
}
+
+ if (ctl->perturb_period < 0 ||
+ ctl->perturb_period > INT_MAX / HZ) {
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
+ return -EINVAL;
+ }
+ perturb_period = ctl->perturb_period * HZ;
+
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
return -EINVAL;
@@ -672,14 +680,12 @@ static int sfq_change(struct Qdisc *sch,
headdrop = q->headdrop;
maxdepth = q->maxdepth;
maxflows = q->maxflows;
- perturb_period = q->perturb_period;
quantum = q->quantum;
flags = q->flags;
/* update and validate configuration */
if (ctl->quantum)
quantum = ctl->quantum;
- perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {
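For illustration only, here is a small standalone C sketch of the range check added above; HZ is a hypothetical stand-in for the kernel's CONFIG_HZ, and the sample values mirror the tc tests in the commit message:

/* Illustrative userspace check: ctl->perturb_period must be non-negative and
 * at most INT_MAX / HZ, or perturb_period * HZ overflows a signed int. */
#include <limits.h>
#include <stdio.h>

#define HZ 1000	/* assumption: a common CONFIG_HZ value */

static int perturb_ok(int period)
{
	return period >= 0 && period <= INT_MAX / HZ;
}

int main(void)
{
	int samples[] = { -10, 2000000, 1000000000 };

	for (int i = 0; i < 3; i++)
		printf("perturb %d -> %s\n", samples[i],
		       perturb_ok(samples[i]) ? "accepted" : "rejected");
	return 0;
}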

View File

@@ -1,93 +0,0 @@
From 90a5248443f925040b46e32fcf6715615c73e396 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 6 Jun 2025 13:50:32 +0100
Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
While an OOM failure in commit_merge() isn't really feasible due to the
allocation which might fail (a maple tree pre-allocation) being 'too small
to fail', we do need to handle this case correctly regardless.
In vma_merge_existing_range(), we can theoretically encounter failures
which result in an OOM error in two ways - firstly dup_anon_vma() might
fail with an OOM error, and secondly commit_merge() failing, ultimately,
to pre-allocate a maple tree node.
The abort logic for dup_anon_vma() resets the VMA iterator to the initial
range, ensuring that any logic looping on this iterator will correctly
proceed to the next VMA.
However the commit_merge() abort logic does not do the same thing. This
resulted in a syzbot report occurring because mlockall() iterates through
VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
being specified due to incorrect iterator state.
While making this change, it became apparent we are duplicating logic -
the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
option on modify/merge, use in uffd release") duplicates the
vmg->give_up_on_oom check in both abort branches.
Additionally, we observe that we can perform the anon_dup check safely on
dup_anon_vma() failure, as this will not be modified should this call
fail.
Finally, we need to reset the iterator in both cases, so now we can simply
use the exact same code to abort for both.
We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
be otherwise and it allows us to implement the abort check more neatly.
Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vma.c | 22 ++++------------------
1 file changed, 4 insertions(+), 18 deletions(-)
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -927,26 +927,9 @@ static __must_check struct vm_area_struc
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- /*
- * We've cleaned up any cloned anon_vma's, no VMAs have been
- * modified, no harm no foul if the user requests that we not
- * report this and just give up, leaving the VMAs unmerged.
- */
- if (!vmg->give_up_on_oom)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -955,6 +938,9 @@ abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
/*
* This means we have failed to clone anon_vma's correctly, but no
* actual changes to VMAs have occurred, so no harm no foul - if the

View File

@@ -1,90 +0,0 @@
From 7c9d5350d8acfe1b876a8acabdf247b44a803d58 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 6 Jun 2025 10:28:07 +0100
Subject: mm: close theoretical race where stale TLB entries could linger
Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a
parallel reclaim leaving stale TLB entries") described a theoretical race
as such:
"""
Nadav Amit identified a theoretical race between page reclaim and mprotect
due to TLB flushes being batched outside of the PTL being held.
He described the race as follows:
CPU0 CPU1
---- ----
user accesses memory using RW PTE
[PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
mprotect(addr, PROT_READ)
==> change_pte_range()
==> [ PTE non-present - no flush ]
user writes using cached RW PTE
...
try_to_unmap_flush()
The same type of race exists for reads when protecting for PROT_NONE and
also exists for operations that can leave an old TLB entry behind such as
munmap, mremap and madvise.
"""
The solution was to introduce flush_tlb_batched_pending() and call it
under the PTL from mprotect/madvise/munmap/mremap to complete any pending
tlb flushes.
However, while madvise_free_pte_range() and
madvise_cold_or_pageout_pte_range() were both retro-fitted to call
flush_tlb_batched_pending() immediately after initially acquiring the PTL,
they both temporarily release the PTL to split a large folio if they
stumble upon one. In this case, where re-acquiring the PTL
flush_tlb_batched_pending() must be called again, but it previously was
not. Let's fix that.
There are 2 Fixes: tags here: the first is the commit that fixed
madvise_free_pte_range(). The second is the commit that added
madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the
faulty pattern from madvise_free_pte_range().
This is a theoretical bug discovered during code review.
Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com
Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries")
Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/madvise.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -503,6 +503,7 @@ restart:
pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;
@@ -736,6 +737,7 @@ static int madvise_free_pte_range(pmd_t
start_pte = pte;
if (!start_pte)
break;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
if (!err)
nr = 0;

View File

@@ -1,33 +0,0 @@
From 862a81c79f0bea8ede0352b637b44716f02f71b9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 11:01:49 -0600
Subject: io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
If peeking a bunch of buffers, normally io_ring_buffers_peek() will
truncate the end buffer. This isn't optimal as presumably more data will
be arriving later, and hence it's better to stop with the last full
buffer rather than truncate the end buffer.
Cc: stable@vger.kernel.org
Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers")
Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/kbuf.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct i
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
- if (!(bl->flags & IOBL_INC))
+ if (!(bl->flags & IOBL_INC)) {
+ if (iov != arg->iovs)
+ break;
buf->len = len;
+ }
}
iov->iov_base = u64_to_user_ptr(buf->addr);
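A hedged userspace sketch of the peek policy this change switches to (the names below are made up, not the io_uring API): once the length cap is reached, only whole buffers are returned, unless the very first buffer is the one being capped, in which case it is still truncated:

/* Illustrative only. Mirrors the logic above: stop before a buffer that
 * would have to be truncated, except when it is the first (and only) one. */
#include <stddef.h>
#include <stdio.h>

static size_t peek(const size_t *bufs, size_t n, size_t max_len,
		   size_t *out, size_t *count)
{
	size_t used = 0, c = 0;

	for (size_t i = 0; i < n && used < max_len; i++) {
		size_t len = bufs[i];

		if (used + len > max_len) {
			if (c > 0)		/* keep only full buffers */
				break;
			len = max_len - used;	/* first buffer: truncate */
		}
		out[c++] = len;
		used += len;
	}
	*count = c;
	return used;
}

int main(void)
{
	size_t bufs[] = { 4096, 4096, 4096 };
	size_t out[3], count;
	size_t used = peek(bufs, 3, 10000, out, &count);

	printf("returned %zu buffers, %zu bytes\n", count, used);
	return 0;
}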

View File

@@ -1,54 +0,0 @@
From bb3d761325a1707c8064a3d7dd556ed6a501a2e7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 13:37:41 -0600
Subject: nvme: always punt polled uring_cmd end_io work to task_work
Currently NVMe uring_cmd completions will complete locally, if they are
polled. This is done because those completions are always invoked from
task context. And while that is true, there's no guarantee that it's
invoked under the right ring context, or even task. If someone does
NVMe passthrough via multiple threads and with a limited number of
poll queues, then ringA may find completions from ringB. For that case,
completing the request may not be sound.
Always just punt the passthrough completions via task_work, which will
redirect the completion, if needed.
Cc: stable@vger.kernel.org
Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/ioctl.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly. Note that using the uring_cmd
- * helper for this is safe only because we check blk_rq_is_poll().
- * As that returns false if we're NOT on a polled queue, then it's
- * safe to use the polled completion helper.
- *
- * Otherwise, move the completion to task work.
+ * IOPOLL could potentially complete this request directly, but
+ * if multiple rings are polling on the same queue, then it's possible
+ * for one ring to find completions for another ring. Punting the
+ * completion via task_work will always direct it to the right
+ * location, rather than potentially complete requests for ringA
+ * under iopoll invocations from ringB.
*/
- if (blk_rq_is_poll(req)) {
- if (pdu->bio)
- blk_rq_unmap_user(pdu->bio);
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
- } else {
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
- }
-
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
return RQ_END_IO_FREE;
}

View File

@@ -1,33 +0,0 @@
From a57621608b2cbcbd0c7da184e9012b9b111a8577 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 11 Jun 2025 09:59:15 +0900
Subject: block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
When blk_zone_write_plug_bio_endio() is called for a regular write BIO
used to emulate a zone append operation, that is, a BIO flagged with
BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the
original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not
cleared. Clear it to fully return the BIO to its original definition.
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-zoned.c | 1 +
1 file changed, 1 insertion(+)
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struc
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
}
/*

View File

@@ -1,65 +0,0 @@
From 7fc5a2cbcc8459cab6ae8c5dd1220768027ccb70 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 11 Jun 2025 08:48:46 -0600
Subject: block: use plug request list tail for one-shot backmerge attempt
Previously, the block layer stored the requests in the plug list in
LIFO order. For this reason, blk_attempt_plug_merge() would check
just the head entry for a back merge attempt, and abort after that
unless requests for multiple queues existed in the plug list. If more
than one request is present in the plug list, this makes the one-shot
back merging less useful than before, as it'll always fail to find a
quick merge candidate.
Use the tail entry for the one-shot merge attempt, which is the last
added request in the list. If that fails, abort immediately unless
there are multiple queues available. If multiple queues are available,
then scan the list. Ideally the latter scan would be a backwards scan
of the list, but as it currently stands, the plug list is singly linked
and hence this isn't easily feasible.
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1127,20 +1127,20 @@ bool blk_attempt_plug_merge(struct reque
if (!plug || rq_list_empty(&plug->mq_list))
return false;
- rq_list_for_each(&plug->mq_list, rq) {
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
- break;
- }
+ rq = plug->mq_list.tail;
+ if (rq->q == q)
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK;
+ else if (!plug->multiple_queues)
+ return false;
- /*
- * Only keep iterating plug list for merges if we have multiple
- * queues
- */
- if (!plug->multiple_queues)
- break;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q != q)
+ continue;
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
}
return false;
}

View File

@@ -1,149 +0,0 @@
From 8ad4520fc849262ab23adbabebd366d4755035bc Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:45 +0300
Subject: Revert "mm/execmem: Unify early execmem_cache behaviour"
The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache
behaviour") changed early behaviour of execemem ROX cache to allow its
usage in early x86 code that allocates text pages when
CONFIG_MITGATION_ITS is enabled.
The permission management of the pages allocated from execmem for ITS
mitigation is now completely contained in arch/x86/kernel/alternatives.c
and therefore there is no need to special case early allocations in
execmem.
This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org
---
arch/x86/mm/init_32.c | 3 ---
arch/x86/mm/init_64.c | 3 ---
include/linux/execmem.h | 8 +-------
mm/execmem.c | 40 +++-------------------------------------
4 files changed, 4 insertions(+), 50 deletions(-)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,7 +30,6 @@
#include <linux/initrd.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
-#include <linux/execmem.h>
#include <asm/asm.h>
#include <asm/bios_ebda.h>
@@ -756,8 +755,6 @@ void mark_rodata_ro(void)
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
- execmem_cache_make_ro();
-
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
#include <linux/gfp.h>
#include <linux/kcore.h>
#include <linux/bootmem_info.h>
-#include <linux/execmem.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
(end - start) >> 10);
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
- execmem_cache_make_ro();
-
kernel_set_to_readonly = 1;
/*
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -54,7 +54,7 @@ enum execmem_range_flags {
EXECMEM_ROX_CACHE = (1 << 1),
};
-#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM)
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
/**
* execmem_fill_trapping_insns - set memory to contain instructions that
* will trap
@@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size
* Return: 0 on success or negative error code on failure.
*/
int execmem_restore_rox(void *ptr, size_t size);
-
-/*
- * Called from mark_readonly(), where the system transitions to ROX.
- */
-void execmem_cache_make_ro(void);
#else
static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
-static inline void execmem_cache_make_ro(void) { }
#endif
/**
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,34 +254,6 @@ out_unlock:
return ptr;
}
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
- struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
- MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
- struct mutex *mutex = &execmem_cache.mutex;
- void *area;
-
- execmem_cache_rox = true;
-
- mutex_lock(mutex);
-
- mas_for_each(&mas_free, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
- set_memory_ro(mas_free.index, pages);
- }
-
- mas_for_each(&mas_busy, area, ULONG_MAX) {
- unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
- set_memory_ro(mas_busy.index, pages);
- }
-
- mutex_unlock(mutex);
-}
-
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- if (execmem_cache_rox) {
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- } else {
- err = set_memory_x((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
- }
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)

View File

@@ -1,63 +0,0 @@
From 85bfdd784bd61df94fd42daca141ed173f647e8c Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Sat, 7 Jun 2025 01:07:37 +1200
Subject: x86/virt/tdx: Avoid indirect calls to TDX assembly functions
Two 'static inline' TDX helper functions (sc_retry() and
sc_retry_prerr()) take function pointer arguments which refer to
assembly functions. Normally, the compiler inlines the TDX helper,
realizes that the function pointer targets are completely static --
thus can be resolved at compile time -- and generates direct call
instructions.
But, other times (like when CONFIG_CC_OPTIMIZE_FOR_SIZE=y), the
compiler declines to inline the helpers and will instead generate
indirect call instructions.
Indirect calls to assembly functions require special annotation (for
various Control Flow Integrity mechanisms). But TDX assembly
functions lack the special annotations and can only be called
directly.
Annotate both the helpers as '__always_inline' to prod the compiler
into maintaining the direct calls. There is no guarantee here, but
Peter has volunteered to report the compiler bug if this assumption
ever breaks[1].
Fixes: 1e66a7e27539 ("x86/virt/tdx: Handle SEAMCALL no entropy error in common code")
Fixes: df01f5ae07dd ("x86/virt/tdx: Add SEAMCALL error printing for module initialization")
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/lkml/20250605145914.GW39944@noisy.programming.kicks-ass.net/ [1]
Link: https://lore.kernel.org/all/20250606130737.30713-1-kai.huang%40intel.com
---
arch/x86/include/asm/tdx.h | 2 +-
arch/x86/virt/vmx/tdx/tdx.c | 5 +++--
2 files changed, 4 insertions(+), 3 deletions(-)
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -100,7 +100,7 @@ void tdx_init(void);
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
-static inline u64 sc_retry(sc_func_t func, u64 fn,
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
struct tdx_module_args *args)
{
int retry = RDRAND_RETRY_LOOPS;
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -69,8 +69,9 @@ static inline void seamcall_err_ret(u64
args->r9, args->r10, args->r11);
}
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
- u64 fn, struct tdx_module_args *args)
+static __always_inline int sc_retry_prerr(sc_func_t func,
+ sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
{
u64 sret = sc_retry(func, fn, args);

View File

@@ -1,31 +0,0 @@
From a94cf5c6e7e31be9d4788916ce847adb15735d81 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 3 Jun 2025 14:14:41 +0300
Subject: x86/mm/pat: don't collapse pages without PSE set
Collapsing pages to a leaf PMD or PUD should be done only if
X86_FEATURE_PSE is available, which is not the case when running e.g.
as a Xen PV guest.
Fixes: 41d88484c71c ("x86/mm/pat: restore large ROX pages after fragmentation")
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250528123557.12847-3-jgross@suse.com
---
arch/x86/mm/pat/set_memory.c | 3 +++
1 file changed, 3 insertions(+)
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd,
pgprot_t pgprot;
int i = 0;
+ if (!cpu_feature_enabled(X86_FEATURE_PSE))
+ return 0;
+
addr &= PMD_MASK;
pte = pte_offset_kernel(pmd, addr);
first = *pte;

View File

@@ -1,34 +0,0 @@
From 8f28d595d167316469bb33b701e27b4b79c1aab1 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:42 +0300
Subject: x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX
is set
Currently the ROX cache in execmem is enabled regardless of the
STRICT_MODULE_RWX setting. This breaks an assumption that module memory
is writable when STRICT_MODULE_RWX is disabled, for instance for kernel
debugging.
Only enable the ROX cache in execmem when STRICT_MODULE_RWX is set, to
restore the original behaviour of module text permissions.
Fixes: 64f6a4e10c05 ("x86: re-enable EXECMEM_ROX support")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-3-rppt@kernel.org
---
arch/x86/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,7 +88,7 @@ config X86
select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
- select ARCH_HAS_EXECMEM_ROX if X86_64
+ select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL

View File

@@ -1,110 +0,0 @@
From 24fd2e3cef1b98f4417b8015ba24a8a4dcaae0c1 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:43 +0300
Subject: x86/its: move its_pages array to struct mod_arch_specific
The pages with ITS thunks allocated for modules are tracked by an
array in 'struct module'.
Since this is a very architecture-specific data structure, move it to
'struct mod_arch_specific'.
No functional changes.
Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org
---
arch/x86/include/asm/module.h | 8 ++++++++
arch/x86/kernel/alternative.c | 19 ++++++++++---------
include/linux/module.h | 5 -----
3 files changed, 18 insertions(+), 14 deletions(-)
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,20 @@
#include <asm-generic/module.h>
#include <asm/orc_types.h>
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+ void **pages;
+ int num;
+#endif
+};
+
struct mod_arch_specific {
#ifdef CONFIG_UNWINDER_ORC
unsigned int num_orcs;
int *orc_unwind_ip;
struct orc_entry *orc_unwind;
#endif
+ struct its_array its_pages;
};
#endif /* _ASM_X86_MODULE_H */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,8 +195,8 @@ void its_fini_mod(struct module *mod)
its_page = NULL;
mutex_unlock(&text_mutex);
- for (int i = 0; i < mod->its_num_pages; i++) {
- void *page = mod->its_page_array[i];
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
execmem_restore_rox(page, PAGE_SIZE);
}
}
@@ -206,11 +206,11 @@ void its_free_mod(struct module *mod)
if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
return;
- for (int i = 0; i < mod->its_num_pages; i++) {
- void *page = mod->its_page_array[i];
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
execmem_free(page);
}
- kfree(mod->its_page_array);
+ kfree(mod->arch.its_pages.pages);
}
#endif /* CONFIG_MODULES */
@@ -223,14 +223,15 @@ static void *its_alloc(void)
#ifdef CONFIG_MODULES
if (its_mod) {
- void *tmp = krealloc(its_mod->its_page_array,
- (its_mod->its_num_pages+1) * sizeof(void *),
+ struct its_array *pages = &its_mod->arch.its_pages;
+ void *tmp = krealloc(pages->pages,
+ (pages->num+1) * sizeof(void *),
GFP_KERNEL);
if (!tmp)
return NULL;
- its_mod->its_page_array = tmp;
- its_mod->its_page_array[its_mod->its_num_pages++] = page;
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
execmem_make_temp_rw(page, PAGE_SIZE);
}
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,11 +586,6 @@ struct module {
atomic_t refcnt;
#endif
-#ifdef CONFIG_MITIGATION_ITS
- int its_num_pages;
- void **its_page_array;
-#endif
-
#ifdef CONFIG_CONSTRUCTORS
/* Constructor functions. */
ctor_fn_t *ctors;

View File

@@ -1,148 +0,0 @@
From 48d82c4dd03de376a6f673bda0f4f2b97138d855 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Tue, 3 Jun 2025 14:14:44 +0300
Subject: x86/its: explicitly manage permissions for ITS pages
execmem_alloc() sets permissions differently depending on the kernel
configuration, CPU support for PSE and whether a page is allocated
before or after mark_rodata_ro().
Add tracking for pages allocated for ITS when patching the core kernel
and make sure the permissions for ITS pages are explicitly managed for
both kernel and module allocations.
Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-5-rppt@kernel.org
---
arch/x86/kernel/alternative.c | 74 ++++++++++++++++++++++++-----------
1 file changed, 52 insertions(+), 22 deletions(-)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -138,6 +138,24 @@ static struct module *its_mod;
#endif
static void *its_page;
static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}
/* Initialize a thunk with the "jmp *reg; int3" instructions. */
static void *its_init_thunk(void *thunk, int reg)
@@ -173,6 +191,21 @@ static void *its_init_thunk(void *thunk,
return thunk + offset;
}
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
#ifdef CONFIG_MODULES
void its_init_mod(struct module *mod)
{
@@ -195,10 +228,8 @@ void its_fini_mod(struct module *mod)
its_page = NULL;
mutex_unlock(&text_mutex);
- for (int i = 0; i < mod->arch.its_pages.num; i++) {
- void *page = mod->arch.its_pages.pages[i];
- execmem_restore_rox(page, PAGE_SIZE);
- }
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
}
void its_free_mod(struct module *mod)
@@ -216,28 +247,23 @@ void its_free_mod(struct module *mod)
static void *its_alloc(void)
{
- void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ struct its_array *pages = &its_pages;
+ void *page;
+#ifdef CONFIG_MODULE
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
+#endif
+
+ page = __its_alloc(pages);
if (!page)
return NULL;
-#ifdef CONFIG_MODULES
- if (its_mod) {
- struct its_array *pages = &its_mod->arch.its_pages;
- void *tmp = krealloc(pages->pages,
- (pages->num+1) * sizeof(void *),
- GFP_KERNEL);
- if (!tmp)
- return NULL;
-
- pages->pages = tmp;
- pages->pages[pages->num++] = page;
+ execmem_make_temp_rw(page, PAGE_SIZE);
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
- execmem_make_temp_rw(page, PAGE_SIZE);
- }
-#endif /* CONFIG_MODULES */
-
- return no_free_ptr(page);
+ return page;
}
static void *its_allocate_thunk(int reg)
@@ -291,7 +317,9 @@ u8 *its_static_thunk(int reg)
return thunk;
}
-#endif
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
/*
* Nomenclature for variable names to simplify and clarify this code and ease
@@ -2368,6 +2396,8 @@ void __init alternative_instructions(voi
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
+ its_fini_core();
+
/*
* Adjust all CALL instructions to point to func()-10, including
* those in .altinstr_replacement.

View File

@@ -1,32 +0,0 @@
From 9bed8caa4c73f2d524d9600c74e6cbcff71c2456 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 29 Apr 2025 08:32:15 -0700
Subject: KVM: SVM: Clear current_vmcb during vCPU free for all *possible* CPUs
When freeing a vCPU and thus its VMCB, clear current_vmcb for all possible
CPUs, not just online CPUs, as it's theoretically possible a CPU could go
offline and come back online in conjunction with KVM reusing the page for
a new VMCB.
Link: https://lore.kernel.org/all/20250320013759.3965869-1-yosry.ahmed@linux.dev
Fixes: fd65d3142f73 ("kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb")
Cc: stable@vger.kernel.org
Cc: Jim Mattson <jmattson@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
[sean: split to separate patch, write changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/svm/svm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1488,7 +1488,7 @@ static void svm_clear_current_vmcb(struc
{
int i;
- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}

View File

@@ -1,43 +0,0 @@
From d74cb6c8b70d9b5ad8482f4821679b83bad9de63 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Mon, 24 Mar 2025 22:08:48 +0800
Subject: KVM: VMX: Flush shadow VMCS on emergency reboot
Ensure the shadow VMCS cache is evicted during an emergency reboot to
prevent potential memory corruption if the cache is evicted after reboot.
This issue was identified through code inspection, as __loaded_vmcs_clear()
flushes both the normal VMCS and the shadow VMCS.
Avoid checking the "launched" state during an emergency reboot, unlike the
behavior in __loaded_vmcs_clear(). This is important because reboot NMIs
can interfere with operations like copy_shadow_to_vmcs12(), where shadow
VMCSes are loaded directly using VMPTRLD. In such cases, if NMIs occur
right after the VMCS load, the shadow VMCSes will be active but the
"launched" state may not be set.
Fixes: 16f5b9034b69 ("KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12")
Cc: stable@vger.kernel.org
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250324140849.2099723-1-chao.gao@intel.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
arch/x86/kvm/vmx/vmx.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -769,8 +769,11 @@ void vmx_emergency_disable_virtualizatio
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}

View File

@@ -1,64 +0,0 @@
From 6e492900893c011cbe13fbb881cf1e11df08982b Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Wed, 18 Jun 2025 07:32:17 +0000
Subject: cgroup,freezer: fix incomplete freezing when attaching tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
An issue was found:
# cd /sys/fs/cgroup/freezer/
# mkdir test
# echo FROZEN > test/freezer.state
# cat test/freezer.state
FROZEN
# sleep 1000 &
[1] 863
# echo 863 > test/cgroup.procs
# cat test/freezer.state
FREEZING
When tasks are migrated to a frozen cgroup, the freezer fails to
immediately freeze the tasks, causing the cgroup to remain in the
"FREEZING" state.
The freeze_task() function is called before clearing the CGROUP_FROZEN
flag. This causes the freezing() check to incorrectly return false,
preventing __freeze_task() from being invoked for the migrated task.
To fix this issue, clear the CGROUP_FROZEN state before calling
freeze_task().
Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic")
Cc: stable@vger.kernel.org # v6.1+
Reported-by: Zhong Jiawei <zhongjiawei1@huawei.com>
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/cgroup/legacy_freezer.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 039d1eb2f215..507b8f19a262 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -188,13 +188,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
-
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}
--
2.50.0

View File

@@ -1,39 +0,0 @@
From ef4d2ebb50f1bd0d5b2e3f1aa2280d7d31e4a3c9 Mon Sep 17 00:00:00 2001
From: Maninder Singh <maninder1.s@samsung.com>
Date: Thu, 6 Mar 2025 14:50:06 +0530
Subject: NFSD: unregister filesystem in case genl_register_family() fails
With rpc_status netlink support, the unregister of register_filesystem()
was missed in case genl_register_family() fails.
Correct this by adding a new error label.
Fixes: bd9d6a3efa97 ("NFSD: add rpc_status netlink support")
Cc: stable@vger.kernel.org
Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfsctl.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2305,7 +2305,7 @@ static int __init init_nfsd(void)
goto out_free_cld;
retval = register_filesystem(&nfsd_fs_type);
if (retval)
- goto out_free_all;
+ goto out_free_nfsd4;
retval = genl_register_family(&nfsd_nl_family);
if (retval)
goto out_free_all;
@@ -2313,6 +2313,8 @@ static int __init init_nfsd(void)
return 0;
out_free_all:
+ unregister_filesystem(&nfsd_fs_type);
+out_free_nfsd4:
nfsd4_destroy_laundry_wq();
out_free_cld:
unregister_cld_notifier();
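The fix is the standard goto-unwind idiom for init paths; a minimal standalone sketch (hypothetical names, not the nfsd symbols) of why each later registration needs its own unwind label:

/* Illustrative only: a late registration failure must undo the earlier one. */
#include <stdio.h>

static int register_fs(void)     { puts("fs registered");   return 0; }
static void unregister_fs(void)  { puts("fs unregistered"); }
static int register_family(void) { puts("family failed");   return -1; }

static int init_example(void)
{
	int ret;

	ret = register_fs();
	if (ret)
		goto out;
	ret = register_family();
	if (ret)
		goto out_unregister_fs;	/* the label this kind of fix adds */
	return 0;

out_unregister_fs:
	unregister_fs();
out:
	return ret;
}

int main(void)
{
	return init_example() ? 1 : 0;
}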

View File

@@ -1,162 +0,0 @@
From 6c2a6b3e27a3a02fd9f3f92458d4995014dfe69f Mon Sep 17 00:00:00 2001
From: Maninder Singh <maninder1.s@samsung.com>
Date: Thu, 6 Mar 2025 14:50:07 +0530
Subject: NFSD: fix race between nfsd registration and exports_proc
As of now nfsd calls create_proc_exports_entry() at the start of init_nfsd()
and cleans up with remove_proc_entry() at the end of exit_nfsd().
This causes a kernel OOPS if there is a race between the below 2 operations:
(i) exportfs -r
(ii) mount -t nfsd none /proc/fs/nfsd
for 5.4 kernel ARM64:
CPU 1:
el1_irq+0xbc/0x180
arch_counter_get_cntvct+0x14/0x18
running_clock+0xc/0x18
preempt_count_add+0x88/0x110
prep_new_page+0xb0/0x220
get_page_from_freelist+0x2d8/0x1778
__alloc_pages_nodemask+0x15c/0xef0
__vmalloc_node_range+0x28c/0x478
__vmalloc_node_flags_caller+0x8c/0xb0
kvmalloc_node+0x88/0xe0
nfsd_init_net+0x6c/0x108 [nfsd]
ops_init+0x44/0x170
register_pernet_operations+0x114/0x270
register_pernet_subsys+0x34/0x50
init_nfsd+0xa8/0x718 [nfsd]
do_one_initcall+0x54/0x2e0
CPU 2 :
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
PC is at : exports_net_open+0x50/0x68 [nfsd]
Call trace:
exports_net_open+0x50/0x68 [nfsd]
exports_proc_open+0x2c/0x38 [nfsd]
proc_reg_open+0xb8/0x198
do_dentry_open+0x1c4/0x418
vfs_open+0x38/0x48
path_openat+0x28c/0xf18
do_filp_open+0x70/0xe8
do_sys_open+0x154/0x248
Sometimes it crashes at exports_net_open() and sometimes at cache_seq_next_rcu(),
and the same is happening on the latest 6.14 kernel as well:
[ 0.000000] Linux version 6.14.0-rc5-next-20250304-dirty
...
[ 285.455918] Unable to handle kernel paging request at virtual address 00001f4800001f48
...
[ 285.464902] pc : cache_seq_next_rcu+0x78/0xa4
...
[ 285.469695] Call trace:
[ 285.470083] cache_seq_next_rcu+0x78/0xa4 (P)
[ 285.470488] seq_read+0xe0/0x11c
[ 285.470675] proc_reg_read+0x9c/0xf0
[ 285.470874] vfs_read+0xc4/0x2fc
[ 285.471057] ksys_read+0x6c/0xf4
[ 285.471231] __arm64_sys_read+0x1c/0x28
[ 285.471428] invoke_syscall+0x44/0x100
[ 285.471633] el0_svc_common.constprop.0+0x40/0xe0
[ 285.471870] do_el0_svc_compat+0x1c/0x34
[ 285.472073] el0_svc_compat+0x2c/0x80
[ 285.472265] el0t_32_sync_handler+0x90/0x140
[ 285.472473] el0t_32_sync+0x19c/0x1a0
[ 285.472887] Code: f9400885 93407c23 937d7c27 11000421 (f86378a3)
[ 285.473422] ---[ end trace 0000000000000000 ]---
It reproduces simply with the below script:
while [ 1 ]
do
/exportfs -r
done &
while [ 1 ]
do
insmod /nfsd.ko
mount -t nfsd none /proc/fs/nfsd
umount /proc/fs/nfsd
rmmod nfsd
done &
So exporting interfaces to user space shall be done last and
cleanup done first.
With this change there is no kernel OOPS.
Co-developed-by: Shubham Rana <s9.rana@samsung.com>
Signed-off-by: Shubham Rana <s9.rana@samsung.com>
Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfsctl.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2291,12 +2291,9 @@ static int __init init_nfsd(void)
if (retval)
goto out_free_pnfs;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
- retval = create_proc_exports_entry();
- if (retval)
- goto out_free_lockd;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_exports;
+ goto out_free_lockd;
retval = register_cld_notifier();
if (retval)
goto out_free_subsys;
@@ -2308,11 +2305,16 @@ static int __init init_nfsd(void)
goto out_free_nfsd4;
retval = genl_register_family(&nfsd_nl_family);
if (retval)
+ goto out_free_filesystem;
+ retval = create_proc_exports_entry();
+ if (retval)
goto out_free_all;
nfsd_localio_ops_init();
return 0;
out_free_all:
+ genl_unregister_family(&nfsd_nl_family);
+out_free_filesystem:
unregister_filesystem(&nfsd_fs_type);
out_free_nfsd4:
nfsd4_destroy_laundry_wq();
@@ -2320,9 +2322,6 @@ out_free_cld:
unregister_cld_notifier();
out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_exports:
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
nfsd_lockd_shutdown();
nfsd_drc_slab_free();
@@ -2335,14 +2334,14 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();

View File

@@ -1,35 +0,0 @@
From 0d4fc17cb5da09d14dbff91da7e28e50d3f54af2 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <okorniev@redhat.com>
Date: Fri, 21 Mar 2025 20:13:04 -0400
Subject: nfsd: fix access checking for NLM under XPRTSEC policies
When an export's xprtsec policy is set to "tls" and/or "mtls", but an
NFS client is doing a v3 xprtsec=tls mount, NLM locking calls fail with
an error because there is currently no support for NLM with TLS.
Until such support is added, allow NLM calls under a TLS-secured
policy.
Fixes: 4cc9b9f2bf4d ("nfsd: refine and rename NFSD_MAY_LOCK")
Cc: stable@vger.kernel.org
Signed-off-by: Olga Kornievskaia <okorniev@redhat.com>
Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/export.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_expo
test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
goto ok;
}
- goto denied;
+ if (!may_bypass_gss)
+ goto denied;
ok:
/* legacy gss-only clients are always OK: */

View File

@@ -1,32 +0,0 @@
From 2fa924062a9494772cd997cb8b1ec572cfe6490f Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Fri, 28 Mar 2025 11:05:59 +1100
Subject: nfsd: nfsd4_spo_must_allow() must check this is a v4 compound request
If the request being processed is not a v4 compound request, then
examining the cstate can have undefined results.
This patch adds a check that the rpc procedure being executed
(rq_procinfo) is the NFSPROC4_COMPOUND procedure.
Reported-by: Olga Kornievskaia <okorniev@redhat.com>
Cc: stable@vger.kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfs4proc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3766,7 +3766,8 @@ bool nfsd4_spo_must_allow(struct svc_rqs
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
u32 opiter;
- if (!cstate->minorversion)
+ if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] ||
+ cstate->minorversion == 0)
return false;
if (cstate->spo_must_allowed)

View File

@@ -1,47 +0,0 @@
From c860b8340bf921de66aa7871f40507dd5628926f Mon Sep 17 00:00:00 2001
From: Li Lingfeng <lilingfeng3@huawei.com>
Date: Mon, 14 Apr 2025 22:38:52 +0800
Subject: nfsd: Initialize ssc before laundromat_work to prevent NULL
dereference
In nfs4_state_start_net(), laundromat_work may access nfsd_ssc through
nfs4_laundromat -> nfsd4_ssc_expire_umount. If nfsd_ssc isn't initialized,
this can cause NULL pointer dereference.
Normally the delayed start of laundromat_work allows sufficient time for
nfsd_ssc initialization to complete. However, when the kernel waits too
long for userspace responses (e.g. in nfs4_state_start_net ->
nfsd4_end_grace -> nfsd4_record_grace_done -> nfsd4_cld_grace_done ->
cld_pipe_upcall -> __cld_pipe_upcall -> wait_for_completion path), the
delayed work may start before nfsd_ssc initialization finishes.
Fix this by moving nfsd_ssc initialization before starting laundromat_work.
Fixes: f4e44b393389 ("NFSD: delay unmount source's export after inter-server copy completed.")
Cc: stable@vger.kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfssvc.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *
if (ret)
goto out_filecache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_init_umount_work(nn);
+#endif
ret = nfs4_state_start_net(net);
if (ret)
goto out_reply_cache;
-#ifdef CONFIG_NFSD_V4_2_INTER_SSC
- nfsd4_ssc_init_umount_work(nn);
-#endif
nn->nfsd_net_up = true;
return 0;

View File

@@ -1,62 +0,0 @@
From 01089ae8fff5bcc6e9949d50d76b70f2a16abe89 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 7 May 2025 10:45:15 -0400
Subject: NFSD: Implement FATTR4_CLONE_BLKSIZE attribute
RFC 7862 states that if an NFS server implements a CLONE operation,
it MUST also implement FATTR4_CLONE_BLKSIZE. NFSD implements CLONE,
but does not implement FATTR4_CLONE_BLKSIZE.
Note that in Section 12.2, RFC 7862 claims that
FATTR4_CLONE_BLKSIZE is RECOMMENDED, not REQUIRED. Likely this is
because a minor version is not permitted to add a REQUIRED
attribute. Confusing.
We assume this attribute reports a block size as a count of bytes,
as RFC 7862 does not specify a unit.
Reported-by: Roland Mainz <roland.mainz@nrubsig.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
Cc: stable@vger.kernel.org # v6.7+
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfs4xdr.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppat
return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
}
+/*
+ * Copied from generic_remap_checks/generic_remap_file_range_prep.
+ *
+ * These generic functions use the file system's s_blocksize, but
+ * individual file systems aren't required to use
+ * generic_remap_file_range_prep. Until there is a mechanism for
+ * determining a particular file system's (or file's) clone block
+ * size, this is the best NFSD can do.
+ */
+static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct inode *inode = d_inode(args->dentry);
+
+ return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize);
+}
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
@@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fa
[FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
[FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
[FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
- [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize,
[FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
[FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,

View File

@@ -1,65 +0,0 @@
From e0246422dfc08dec0fc3c96f3201bab6ceec6774 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Wed, 23 Apr 2025 15:22:50 +0200
Subject: fs/nfs/read: fix double-unlock bug in nfs_return_empty_folio()
Sometimes, when a file was read while it was being truncated by
another NFS client, the kernel could deadlock because folio_unlock()
was called twice, and the second call would XOR back the `PG_locked`
flag.
Most of the time (depending on the timing of the truncation), nobody
notices the problem because folio_unlock() gets called three times,
which flips `PG_locked` back off:
1. vfs_read, nfs_read_folio, ... nfs_read_add_folio,
nfs_return_empty_folio
2. vfs_read, nfs_read_folio, ... netfs_read_collection,
netfs_unlock_abandoned_read_pages
3. vfs_read, ... nfs_do_read_folio, nfs_read_add_folio,
nfs_return_empty_folio
The problem is that nfs_read_add_folio() is not supposed to unlock the
folio if fscache is enabled, and a nfs_netfs_folio_unlock() check is
missing in nfs_return_empty_folio().
Rarely this leads to a warning in netfs_read_collection():
------------[ cut here ]------------
R=0000031c: folio 10 is not locked
WARNING: CPU: 0 PID: 29 at fs/netfs/read_collect.c:133 netfs_read_collection+0x7c0/0xf00
[...]
Workqueue: events_unbound netfs_read_collection_worker
RIP: 0010:netfs_read_collection+0x7c0/0xf00
[...]
Call Trace:
<TASK>
netfs_read_collection_worker+0x67/0x80
process_one_work+0x12e/0x2c0
worker_thread+0x295/0x3a0
Most of the time, however, processes just get stuck forever in
folio_wait_bit_common(), waiting for `PG_locked` to disappear, which
never happens because nobody is really holding the folio lock.
Fixes: 000dbe0bec05 ("NFS: Convert buffered read paths to use netfs when fscache is enabled")
Cc: stable@vger.kernel.org
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
fs/nfs/read.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct
{
folio_zero_segment(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
- folio_unlock(folio);
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
return 0;
}

View File

@@ -1,32 +0,0 @@
From d9f4762296075cc67d9974d093a87064075853e1 Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Wed, 30 Apr 2025 07:12:29 -0400
Subject: NFSv4: Don't check for OPEN feature support in v4.1
fattr4_open_arguments is a v4.2 recommended attribute, so we shouldn't
be sending it to v4.1 servers.
Fixes: cb78f9b7d0c0 ("nfs: fix the fetch of FATTR4_OPEN_ARGUMENTS")
Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Cc: stable@vger.kernel.org # 6.11+
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
fs/nfs/nfs4proc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3976,8 +3976,9 @@ static int _nfs4_server_capabilities(str
FATTR4_WORD0_CASE_INSENSITIVE |
FATTR4_WORD0_CASE_PRESERVING;
if (minorversion)
- bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT |
- FATTR4_WORD2_OPEN_ARGUMENTS;
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ if (minorversion > 1)
+ bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS;
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {

View File

@@ -1,96 +0,0 @@
From 7147868788966e9032cdeb0cf33bd1ae47785088 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Tue, 13 May 2025 12:08:31 -0400
Subject: NFS: always probe for LOCALIO support asynchronously
It was reported that NFS client mounts of AWS Elastic File System
(EFS) volumes are slow; this is because the AWS firewall disallows
LOCALIO (it does not consider the use of NFS_LOCALIO_PROGRAM
valid), see: https://bugzilla.redhat.com/show_bug.cgi?id=2335129
Switch to performing the LOCALIO probe asynchronously to address the
potential for the NFS LOCALIO protocol being disallowed and/or slowed
by the remote server's response.
While at it, fix nfs_local_probe_async() to always take/put a
reference on the nfs_client that is using the LOCALIO protocol.
Also, unexport the nfs_local_probe() symbol and make it private to
fs/nfs/localio.c
This change has the side-effect of initially issuing reads, writes and
commits over the wire via SUNRPC until the LOCALIO probe completes.
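As a rough user-space sketch of the take-a-reference-or-bail pattern the async probe worker relies on (assumed names; the kernel code uses refcount_inc_not_zero() and nfs_put_client()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct client { atomic_int refs; };       /* hypothetical client object */

static bool get_client_if_live(struct client *c)
{
	int old = atomic_load(&c->refs);
	while (old != 0)
		if (atomic_compare_exchange_weak(&c->refs, &old, old + 1))
			return true;
	return false;                     /* object already being torn down */
}

static void put_client(struct client *c)
{
	if (atomic_fetch_sub(&c->refs, 1) == 1)
		puts("last reference dropped: free the client here");
}

static void probe_work(struct client *c)
{
	if (!get_client_if_live(c))       /* bail out instead of racing teardown */
		return;
	puts("probing server for LOCALIO support...");
	put_client(c);
}

int main(void)
{
	struct client c = { .refs = 1 };
	probe_work(&c);
	put_client(&c);
	return 0;
}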
Suggested-by: Jeff Layton <jlayton@kernel.org> # to always probe async
Fixes: 76d4cb6345da ("nfs: probe for LOCALIO when v4 client reconnects to server")
Cc: stable@vger.kernel.org # 6.14+
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
fs/nfs/client.c | 2 +-
fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +-
fs/nfs/internal.h | 1 -
fs/nfs/localio.c | 6 ++++--
4 files changed, 6 insertions(+), 5 deletions(-)
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const
spin_unlock(&nn->nfs_client_lock);
new = rpc_ops->init_client(new, cl_init);
if (!IS_ERR(new))
- nfs_local_probe(new);
+ nfs_local_probe_async(new);
return new;
}
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_la
* keep ds_clp even if DS is local, so that if local IO cannot
* proceed somehow, we can fall back to NFS whenever we want.
*/
- nfs_local_probe(ds->ds_clp);
+ nfs_local_probe_async(ds->ds_clp);
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
-extern void nfs_local_probe(struct nfs_client *);
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(str
* - called after alloc_client and init_client (so cl_rpcclient exists)
* - this function is idempotent, it can be called for old or new clients
*/
-void nfs_local_probe(struct nfs_client *clp)
+static void nfs_local_probe(struct nfs_client *clp)
{
/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
if (!localio_enabled ||
@@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *
nfs_localio_enable_client(clp);
nfs_uuid_end(&clp->cl_uuid);
}
-EXPORT_SYMBOL_GPL(nfs_local_probe);
void nfs_local_probe_async_work(struct work_struct *work)
{
struct nfs_client *clp =
container_of(work, struct nfs_client, cl_local_probe_work);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return;
nfs_local_probe(clp);
+ nfs_put_client(clp);
}
void nfs_local_probe_async(struct nfs_client *clp)

View File

@@ -1,29 +0,0 @@
From 97831e31e43bb023d208b2344546a4e51e580dc6 Mon Sep 17 00:00:00 2001
From: Ruben Devos <devosruben6@gmail.com>
Date: Sun, 1 Jun 2025 19:18:55 +0200
Subject: smb: client: add NULL check in automount_fullpath
page is checked for null in __build_path_from_dentry_optional_prefix
when tcon->origin_fullpath is not set. However, the check is missing when
it is set.
Add a check to prevent a potential NULL pointer dereference.
Signed-off-by: Ruben Devos <devosruben6@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/namespace.c | 3 +++
1 file changed, 3 insertions(+)
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -146,6 +146,9 @@ static char *automount_fullpath(struct d
}
spin_unlock(&tcon->tc_lock);
+ if (unlikely(!page))
+ return ERR_PTR(-ENOMEM);
+
s = dentry_path_raw(dentry, page, PATH_MAX);
if (IS_ERR(s))
return s;

View File

@@ -1,39 +0,0 @@
From 0ca6d39b6d40b868eb6b4021f918de7a0f6a0f2e Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:13 +0530
Subject: cifs: reset connections for all channels when reconnect requested
cifs_reconnect can be called with a flag to mark the session as needing
reconnect too. When this is done, we expect the connections of all
channels to be reconnected too, which is not happening today.
Without doing this, we have seen bad things happen when primary and
secondary channels are connected to different servers (in case of cloud
services like Azure Files SMB).
This change would force all connections to reconnect as well, not just
the sessions and tcons.
Cc: <stable@vger.kernel.org>
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/connect.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -377,6 +377,13 @@ static int __cifs_reconnect(struct TCP_S
if (!cifs_tcp_ses_needs_reconnect(server, 1))
return 0;
+ /*
+ * if smb session has been marked for reconnect, also reconnect all
+ * connections. This way, the other connections do not end up bad.
+ */
+ if (mark_smb_session)
+ cifs_signal_cifsd_for_reconnect(server, mark_smb_session);
+
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
cifs_abort_connection(server);

View File

@@ -1,31 +0,0 @@
From d1f84c6baebc480106c9558dea4842ecb3059017 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:14 +0530
Subject: cifs: update dstaddr whenever channel iface is updated
When the server interface info changes (more common in clustered
servers like Azure Files), the per-channel iface gets updated.
However, this did not update the corresponding dstaddr. As a result
these channels will still connect (or try connecting) to older addresses.
Fixes: b54034a73baf ("cifs: during reconnect, update interface if necessary")
Cc: <stable@vger.kernel.org>
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/sess.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -445,6 +445,10 @@ cifs_chan_update_iface(struct cifs_ses *
ses->chans[chan_index].iface = iface;
spin_unlock(&ses->chan_lock);
+
+ spin_lock(&server->srv_lock);
+ memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr));
+ spin_unlock(&server->srv_lock);
}
static int

View File

@@ -1,33 +0,0 @@
From 2bffd71a70fa4695f62712688a720393cc92032b Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:16 +0530
Subject: cifs: dns resolution is needed only for primary channel
When calling cifs_reconnect, before the connection to the
server is reestablished, the code today does a DNS resolution and
updates server->dstaddr.
However, this is not necessary for secondary channels. Secondary
channels use the interface list returned by the server to decide
which address to connect to. And that happens after tcon is reconnected
and server interfaces are requested.
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/connect.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -392,7 +392,8 @@ static int __cifs_reconnect(struct TCP_S
try_to_freeze();
cifs_server_lock(server);
- if (!cifs_swn_set_server_dstaddr(server)) {
+ if (!cifs_swn_set_server_dstaddr(server) &&
+ !SERVER_IS_CHAN(server)) {
/* resolve the hostname again to make sure that IP address is up-to-date */
rc = reconn_set_ipaddr_from_hostname(server);
cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc);

View File

@@ -1,73 +0,0 @@
From 918f494c058028cee8bdff33a4aa613377da61f0 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:12 +0530
Subject: cifs: deal with the channel loading lag while picking channels
Our current approach to select a channel for sending requests is this:
1. iterate all channels to find the min and max queue depth
2. if min and max are not the same, pick the channel with min depth
3. if min and max are same, round robin, as all channels are equally loaded
The problem with this approach is that there is a lag between selecting
a channel and sending the request (which increases the queue depth on the channel).
While these numbers will eventually catch up, there could be a skew in the
channel usage, depending on the application's I/O parallelism and the server's
speed of handling requests.
With sufficient parallelism, this lag can artificially increase the queue depth,
thereby impacting performance negatively.
This change makes step 1 above start the iteration from the last selected
channel, to reduce the skew in channel usage even in the presence of this lag.
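As a standalone illustration of the selection logic described above (assumed names and a plain array, not the cifs.ko data structures), a sketch might look like:

#include <limits.h>
#include <stdio.h>

/* Start the scan at the channel after the last one used, so both the
 * min-depth search and the equal-load round robin rotate instead of
 * piling onto channel 0.
 */
static int pick_channel(const unsigned int *in_flight, int nchans,
			unsigned int *chan_seq)
{
	unsigned int start = (*chan_seq)++;
	unsigned int min = UINT_MAX, max = 0;
	int i, cur, index = 0;

	for (i = 0; i < nchans; i++) {
		cur = (start + i) % nchans;
		if (in_flight[cur] < min) {
			min = in_flight[cur];
			index = cur;
		}
		if (in_flight[cur] > max)
			max = in_flight[cur];
	}
	if (min == max)                   /* equally loaded: plain round robin */
		index = start % nchans;
	return index;
}

int main(void)
{
	unsigned int depth[3] = { 4, 4, 4 };
	unsigned int seq = 0;

	for (int i = 0; i < 4; i++)
		printf("picked channel %d\n", pick_channel(depth, 3, &seq));
	return 0;
}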
Fixes: ea90708d3cf3 ("cifs: use the least loaded channel for sending requests")
Cc: <stable@vger.kernel.org>
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/transport.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1018,14 +1018,16 @@ struct TCP_Server_Info *cifs_pick_channe
uint index = 0;
unsigned int min_in_flight = UINT_MAX, max_in_flight = 0;
struct TCP_Server_Info *server = NULL;
- int i;
+ int i, start, cur;
if (!ses)
return NULL;
spin_lock(&ses->chan_lock);
+ start = atomic_inc_return(&ses->chan_seq);
for (i = 0; i < ses->chan_count; i++) {
- server = ses->chans[i].server;
+ cur = (start + i) % ses->chan_count;
+ server = ses->chans[cur].server;
if (!server || server->terminate)
continue;
@@ -1042,17 +1044,15 @@ struct TCP_Server_Info *cifs_pick_channe
*/
if (server->in_flight < min_in_flight) {
min_in_flight = server->in_flight;
- index = i;
+ index = cur;
}
if (server->in_flight > max_in_flight)
max_in_flight = server->in_flight;
}
/* if all channels are equally loaded, fall back to round-robin */
- if (min_in_flight == max_in_flight) {
- index = (uint)atomic_inc_return(&ses->chan_seq);
- index %= ses->chan_count;
- }
+ if (min_in_flight == max_in_flight)
+ index = (uint)start % ses->chan_count;
server = ses->chans[index].server;
spin_unlock(&ses->chan_lock);

View File

@@ -1,82 +0,0 @@
From 2cc6528030c91406031698e047896faa99fc0092 Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:15 +0530
Subject: cifs: serialize other channels when query server interfaces is
pending
Today, during smb2_reconnect, session_mutex is released as soon as
the tcon is reconnected and is in a good state. However, when
multichannel is enabled, a query of server interfaces also follows.
We've seen that this query can race with reconnects of other
channels, causing them to step on each other's reconnects.
This change extends the hold of session_mutex till after the query of
server interfaces is complete. In order to avoid recursive smb2_reconnect
checks during query ioctl, this change also introduces a session flag
for sessions where such a query is in progress.
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/cifsglob.h | 1 +
fs/smb/client/smb2pdu.c | 24 ++++++++++++++++++------
2 files changed, 19 insertions(+), 6 deletions(-)
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1084,6 +1084,7 @@ struct cifs_chan {
};
#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+#define CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES (0x2)
/*
* Session structure. One of these for each uid session with a particular host
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -411,14 +411,19 @@ skip_sess_setup:
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) &&
server->ops->query_server_interfaces) {
- mutex_unlock(&ses->session_mutex);
-
/*
- * query server network interfaces, in case they change
+ * query server network interfaces, in case they change.
+ * Also mark the session as pending this update while the query
+ * is in progress. This will be used to avoid calling
+ * smb2_reconnect recursively.
*/
+ ses->flags |= CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
xid = get_xid();
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
+ ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
+
+ mutex_unlock(&ses->session_mutex);
if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
/*
@@ -560,11 +565,18 @@ static int smb2_ioctl_req_init(u32 opcod
struct TCP_Server_Info *server,
void **request_buf, unsigned int *total_len)
{
- /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
- if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ /*
+ * Skip reconnect in one of the following cases:
+ * 1. For FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs
+ * 2. For FSCTL_QUERY_NETWORK_INTERFACE_INFO IOCTL when called from
+ * smb2_reconnect (indicated by CIFS_SES_FLAG_SCALE_CHANNELS ses flag)
+ */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO ||
+ (opcode == FSCTL_QUERY_NETWORK_INTERFACE_INFO &&
+ (tcon->ses->flags & CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES)))
return __smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
- }
+
return smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
}

View File

@@ -1,64 +0,0 @@
From 48fd713e7c35aba7a4c3ed327977897909575e3e Mon Sep 17 00:00:00 2001
From: Shyam Prasad N <sprasad@microsoft.com>
Date: Mon, 2 Jun 2025 22:37:17 +0530
Subject: cifs: do not disable interface polling on failure
When a server has multichannel enabled, we keep polling the server
for interfaces periodically. However, when this query fails, we
disable the polling. This can be problematic as it takes away the
chance for the server to start advertising again.
This change reschedules the delayed work, even if the current call
failed. That way, multichannel sessions can recover.
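A minimal sketch of the retry policy described here (hypothetical names; the real code simply re-queues the delayed work with queue_delayed_work() regardless of the result):

#include <stdio.h>

static int query_server_interfaces(void)
{
	return -1;                        /* pretend the query failed this time */
}

static void poll_interfaces_once(void)
{
	int rc = query_server_interfaces();

	if (rc)
		fprintf(stderr, "interface query failed: %d (will retry)\n", rc);
	/* arm the next poll unconditionally instead of giving up on failure */
	puts("rescheduling next interface poll");
}

int main(void)
{
	poll_interfaces_once();
	return 0;
}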
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/connect.c | 6 +-----
fs/smb/client/smb2pdu.c | 9 +++++----
2 files changed, 6 insertions(+), 9 deletions(-)
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -116,13 +116,9 @@ static void smb2_query_server_interfaces
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return;
-
+ if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
- }
queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
(SMB_INTERFACE_POLL_INTERVAL * HZ));
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -423,6 +423,10 @@ skip_sess_setup:
free_xid(xid);
ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
+ /* regardless of rc value, setup polling */
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
mutex_unlock(&ses->session_mutex);
if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
@@ -443,11 +447,8 @@ skip_sess_setup:
if (ses->chan_max > ses->chan_count &&
ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1) {
+ if (ses->chan_count == 1)
cifs_server_dbg(VFS, "supports multichannel now\n");
- queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
- (SMB_INTERFACE_POLL_INTERVAL * HZ));
- }
cifs_try_adding_channels(ses);
}

View File

@@ -1,148 +0,0 @@
From 17457c5d0fa0b98cef9d2236a1518b1ded25fa5d Mon Sep 17 00:00:00 2001
From: Bharath SM <bharathsm.hsk@gmail.com>
Date: Wed, 11 Jun 2025 16:59:02 +0530
Subject: smb: improve directory cache reuse for readdir operations
Currently, cached directory contents were not reused across subsequent
'ls' operations because the cache validity check relied on comparing
the ctx pointer, which changes with each readdir invocation. As a
result, the cached dir entries were not marked as valid and the cache was
not utilized for subsequent 'ls' operations.
This change uses the file pointer, which remains consistent across all
readdir calls for a given directory instance, to associate and validate
the cache. As a result, cached directory contents can now be
correctly reused, improving performance for repeated directory listings.
Performance gains with a local Windows SMB server:
Without the patch and default actimeo=1:
1000 directory enumeration operations on dir with 10k files took 135.0s
With this patch and actimeo=0:
1000 directory enumeration operations on dir with 10k files took just 5.1s
Signed-off-by: Bharath SM <bharathsm@microsoft.com>
Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Cc: stable@vger.kernel.org
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/client/cached_dir.h | 8 ++++----
fs/smb/client/readdir.c | 28 +++++++++++++++-------------
2 files changed, 19 insertions(+), 17 deletions(-)
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -21,10 +21,10 @@ struct cached_dirent {
struct cached_dirents {
bool is_valid:1;
bool is_failed:1;
- struct dir_context *ctx; /*
- * Only used to make sure we only take entries
- * from a single context. Never dereferenced.
- */
+ struct file *file; /*
+ * Used to associate the cache with a single
+ * open file instance.
+ */
struct mutex de_mutex;
int pos; /* Expected ctx->pos */
struct list_head entries;
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -850,9 +850,9 @@ static bool emit_cached_dirents(struct c
}
static void update_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -861,9 +861,9 @@ static void update_cached_dirents_count(
}
static void finished_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct dir_context *ctx, struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -876,11 +876,12 @@ static void finished_cached_dirents_coun
static void add_cached_dirent(struct cached_dirents *cde,
struct dir_context *ctx,
const char *name, int namelen,
- struct cifs_fattr *fattr)
+ struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -910,7 +911,8 @@ static void add_cached_dirent(struct cac
static bool cifs_dir_emit(struct dir_context *ctx,
const char *name, int namelen,
struct cifs_fattr *fattr,
- struct cached_fid *cfid)
+ struct cached_fid *cfid,
+ struct file *file)
{
bool rc;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -922,7 +924,7 @@ static bool cifs_dir_emit(struct dir_con
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr);
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
@@ -1022,7 +1024,7 @@ static int cifs_filldir(char *find_entry
cifs_prime_dcache(file_dentry(file), &name, &fattr);
return !cifs_dir_emit(ctx, name.name, name.len,
- &fattr, cfid);
+ &fattr, cfid, file);
}
@@ -1073,8 +1075,8 @@ int cifs_readdir(struct file *file, stru
* we need to initialize scanning and storing the
* directory content.
*/
- if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
- cfid->dirents.ctx = ctx;
+ if (ctx->pos == 0 && cfid->dirents.file == NULL) {
+ cfid->dirents.file = file;
cfid->dirents.pos = 2;
}
/*
@@ -1142,7 +1144,7 @@ int cifs_readdir(struct file *file, stru
} else {
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- finished_cached_dirents_count(&cfid->dirents, ctx);
+ finished_cached_dirents_count(&cfid->dirents, ctx, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
cifs_dbg(FYI, "Could not find entry\n");
@@ -1183,7 +1185,7 @@ int cifs_readdir(struct file *file, stru
ctx->pos++;
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- update_cached_dirents_count(&cfid->dirents, ctx);
+ update_cached_dirents_count(&cfid->dirents, file);
mutex_unlock(&cfid->dirents.de_mutex);
}

View File

@@ -1,45 +0,0 @@
From 9d330e139e9993f2489fcfe3048c8e737085646d Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@kernel.org>
Date: Fri, 13 Jun 2025 10:12:43 +0900
Subject: ksmbd: fix null pointer dereference in destroy_previous_session
If the client sets ->PreviousSessionId during the Kerberos session setup
stage, a NULL pointer dereference will happen. Since sess->user is not
set yet, the user argument passed to destroy_previous_session() can be NULL.
sess->user will be set in ksmbd_krb5_authenticate(), so this patch moves
the destroy_previous_session() call after ksmbd_krb5_authenticate().
Cc: stable@vger.kernel.org
Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-27391
Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
fs/smb/server/smb2pdu.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1607,17 +1607,18 @@ static int krb5_authenticate(struct ksmb
out_len = work->response_sz -
(le16_to_cpu(rsp->SecurityBufferOffset) + 4);
- /* Check previous session */
- prev_sess_id = le64_to_cpu(req->PreviousSessionId);
- if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(conn, sess->user, prev_sess_id);
-
retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
out_blob, &out_len);
if (retval) {
ksmbd_debug(SMB, "krb5 authentication failed\n");
return -EINVAL;
}
+
+ /* Check previous session */
+ prev_sess_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_sess_id && prev_sess_id != sess->id)
+ destroy_previous_session(conn, sess->user, prev_sess_id);
+
rsp->SecurityBufferLength = cpu_to_le16(out_len);
if ((conn->sign || server_conf.enforced_signing) ||

View File

@@ -32,7 +32,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -3994,6 +3994,7 @@ static int tcp_ack(struct sock *sk, cons @@ -4003,6 +4003,7 @@ static int tcp_ack(struct sock *sk, cons
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp); rs.prior_in_flight = tcp_packets_in_flight(tp);

View File

@@ -28,7 +28,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
struct tcp_congestion_ops { struct tcp_congestion_ops {
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -4084,6 +4084,7 @@ static int tcp_ack(struct sock *sk, cons @@ -4093,6 +4093,7 @@ static int tcp_ack(struct sock *sk, cons
delivered = tcp_newly_delivered(sk, delivered, flag); delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */ lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);

View File

@@ -42,7 +42,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
*/ */
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -1139,7 +1139,12 @@ static void tcp_verify_retransmit_hint(s @@ -1135,7 +1135,12 @@ static void tcp_verify_retransmit_hint(s
*/ */
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{ {

View File

@@ -39,7 +39,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -1516,6 +1516,17 @@ static bool tcp_shifted_skb(struct sock @@ -1512,6 +1512,17 @@ static bool tcp_shifted_skb(struct sock
WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount); tcp_skb_pcount_add(skb, -pcount);

View File

@@ -54,7 +54,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
if (tcp_ca_needs_ecn(sk)) if (tcp_ca_needs_ecn(sk))
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -5800,13 +5800,14 @@ static void __tcp_ack_snd_check(struct s @@ -5809,13 +5809,14 @@ static void __tcp_ack_snd_check(struct s
/* More than one full frame received... */ /* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&

View File

@@ -35,7 +35,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */ /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -3881,6 +3881,7 @@ static void tcp_process_tlp_ack(struct s @@ -3890,6 +3890,7 @@ static void tcp_process_tlp_ack(struct s
/* ACK advances: there was a loss, so reduce cwnd. Reset /* ACK advances: there was a loss, so reduce cwnd. Reset
* tlp_high_seq in tcp_init_cwnd_reduction() * tlp_high_seq in tcp_init_cwnd_reduction()
*/ */

View File

@@ -31,7 +31,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
}; };
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -3864,7 +3864,8 @@ static int tcp_replace_ts_recent(struct @@ -3873,7 +3873,8 @@ static int tcp_replace_ts_recent(struct
/* This routine deals with acks during a TLP episode and ends an episode by /* This routine deals with acks during a TLP episode and ends an episode by
* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
*/ */
@@ -41,7 +41,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
@@ -3892,6 +3893,11 @@ static void tcp_process_tlp_ack(struct s @@ -3901,6 +3902,11 @@ static void tcp_process_tlp_ack(struct s
FLAG_NOT_DUP | FLAG_DATA_SACKED))) { FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */ /* Pure dupack: original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0; tp->tlp_high_seq = 0;
@@ -53,7 +53,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
} }
} }
@@ -4077,7 +4083,7 @@ static int tcp_ack(struct sock *sk, cons @@ -4086,7 +4092,7 @@ static int tcp_ack(struct sock *sk, cons
tcp_in_ack_event(sk, flag); tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq) if (tp->tlp_high_seq)
@@ -62,7 +62,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
if (tcp_ack_is_dubious(sk, flag)) { if (tcp_ack_is_dubious(sk, flag)) {
if (!(flag & (FLAG_SND_UNA_ADVANCED | if (!(flag & (FLAG_SND_UNA_ADVANCED |
@@ -4122,7 +4128,7 @@ no_queue: @@ -4131,7 +4137,7 @@ no_queue:
tcp_ack_probe(sk); tcp_ack_probe(sk);
if (tp->tlp_high_seq) if (tp->tlp_high_seq)

View File

@@ -83,7 +83,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
.maxlen = sizeof(u8), .maxlen = sizeof(u8),
--- a/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -5682,6 +5682,7 @@ static bool tcp_prune_ofo_queue(struct s @@ -5691,6 +5691,7 @@ static bool tcp_prune_ofo_queue(struct s
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
@@ -91,7 +91,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
@@ -5693,6 +5694,39 @@ static int tcp_prune_queue(struct sock * @@ -5702,6 +5703,39 @@ static int tcp_prune_queue(struct sock *
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0; return 0;
@@ -131,7 +131,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
tcp_collapse_ofo_queue(sk); tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue)) if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL, tcp_collapse(sk, &sk->sk_receive_queue, NULL,
@@ -5711,6 +5745,8 @@ static int tcp_prune_queue(struct sock * @@ -5720,6 +5754,8 @@ static int tcp_prune_queue(struct sock *
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0; return 0;

View File

@@ -180,7 +180,7 @@ Signed-off-by: Alexandre Frade <kernel@xanmod.org>
/* /*
* Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
* prevented for those affected devices. * prevented for those affected devices.
@@ -5171,6 +5271,8 @@ static const struct pci_dev_acs_enabled @@ -5194,6 +5294,8 @@ static const struct pci_dev_acs_enabled
{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
/* Wangxun nics */ /* Wangxun nics */
{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },

View File

@@ -94,7 +94,7 @@ Contains:
-#endif -#endif
--- a/drivers/ata/ahci.c --- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c
@@ -1629,7 +1629,7 @@ static irqreturn_t ahci_thunderx_irq_han @@ -1662,7 +1662,7 @@ static irqreturn_t ahci_thunderx_irq_han
} }
#endif #endif
@@ -103,7 +103,7 @@ Contains:
struct ahci_host_priv *hpriv) struct ahci_host_priv *hpriv)
{ {
int i; int i;
@@ -1642,7 +1642,7 @@ static void ahci_remap_check(struct pci_ @@ -1675,7 +1675,7 @@ static void ahci_remap_check(struct pci_
pci_resource_len(pdev, bar) < SZ_512K || pci_resource_len(pdev, bar) < SZ_512K ||
bar != AHCI_PCI_BAR_STANDARD || bar != AHCI_PCI_BAR_STANDARD ||
!(readl(hpriv->mmio + AHCI_VSCAP) & 1)) !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
@@ -112,7 +112,7 @@ Contains:
cap = readq(hpriv->mmio + AHCI_REMAP_CAP); cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
for (i = 0; i < AHCI_MAX_REMAP; i++) { for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1657,18 +1657,11 @@ static void ahci_remap_check(struct pci_ @@ -1690,18 +1690,11 @@ static void ahci_remap_check(struct pci_
} }
if (!hpriv->remapped_nvme) if (!hpriv->remapped_nvme)
@@ -135,7 +135,7 @@ Contains:
} }
static int ahci_get_irq_vector(struct ata_host *host, int port) static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1912,7 +1905,9 @@ static int ahci_init_one(struct pci_dev @@ -1945,7 +1938,9 @@ static int ahci_init_one(struct pci_dev
return -ENOMEM; return -ENOMEM;
/* detect remapped nvme devices */ /* detect remapped nvme devices */

79
debian/patches/series vendored
View File

@@ -129,9 +129,8 @@ misc-openwrt/0005-mac80211-minstrel_ht-reduce-fluctuations-in-rate-pro.patch
misc-openwrt/0006-mac80211-minstrel_ht-rework-rate-downgrade-code-and-.patch misc-openwrt/0006-mac80211-minstrel_ht-rework-rate-downgrade-code-and-.patch
misc-openwrt/0007-mac80211-increase-quantum-for-airtime-scheduler.patch misc-openwrt/0007-mac80211-increase-quantum-for-airtime-scheduler.patch
misc-openwrt/0008-mac80211-add-AQL-support-for-broadcast-packets.patch misc-openwrt/0008-mac80211-add-AQL-support-for-broadcast-packets.patch
misc-openwrt/0009-mac80211-revert-dynamically-set-codel-parameters-per-station.patch misc-openwrt/0009-mac80211-txq-tune.patch
misc-openwrt/0010-mac80211-txq-tune.patch misc-openwrt/0010-cfg80211-aql-txq-limit.patch
misc-openwrt/0011-cfg80211-aql-txq-limit.patch
misc-openwrt/0101-sched-sch_cake-fix-bulk-flow-accounting-logic-for-host.patch misc-openwrt/0101-sched-sch_cake-fix-bulk-flow-accounting-logic-for-host.patch
misc-openwrt/0201-fq-adjust-memory-size.patch misc-openwrt/0201-fq-adjust-memory-size.patch
@@ -140,26 +139,6 @@ patchset-pf/cpuidle/0001-cpuidle-Prefer-teo-over-menu-governor.patch
patchset-pf/kbuild/0001-ice-mark-ice_write_prof_mask_reg-as-noinline.patch patchset-pf/kbuild/0001-ice-mark-ice_write_prof_mask_reg-as-noinline.patch
patchset-pf/kbuild/0002-wifi-mac80211-mark-copy_mesh_setup-as-noinline.patch patchset-pf/kbuild/0002-wifi-mac80211-mark-copy_mesh_setup-as-noinline.patch
patchset-pf/nfs/0001-NFSD-unregister-filesystem-in-case-genl_register_fam.patch
patchset-pf/nfs/0002-NFSD-fix-race-between-nfsd-registration-and-exports_.patch
patchset-pf/nfs/0003-nfsd-fix-access-checking-for-NLM-under-XPRTSEC-polic.patch
patchset-pf/nfs/0004-nfsd-nfsd4_spo_must_allow-must-check-this-is-a-v4-co.patch
patchset-pf/nfs/0005-nfsd-Initialize-ssc-before-laundromat_work-to-preven.patch
patchset-pf/nfs/0006-NFSD-Implement-FATTR4_CLONE_BLKSIZE-attribute.patch
patchset-pf/nfs/0007-fs-nfs-read-fix-double-unlock-bug-in-nfs_return_empt.patch
patchset-pf/nfs/0008-NFSv4-Don-t-check-for-OPEN-feature-support-in-v4.1.patch
patchset-pf/nfs/0009-NFS-always-probe-for-LOCALIO-support-asynchronously.patch
patchset-pf/smb/0001-smb-client-add-NULL-check-in-automount_fullpath.patch
patchset-pf/smb/0002-cifs-reset-connections-for-all-channels-when-reconne.patch
patchset-pf/smb/0003-cifs-update-dstaddr-whenever-channel-iface-is-update.patch
patchset-pf/smb/0004-cifs-dns-resolution-is-needed-only-for-primary-chann.patch
patchset-pf/smb/0005-cifs-deal-with-the-channel-loading-lag-while-picking.patch
patchset-pf/smb/0006-cifs-serialize-other-channels-when-query-server-inte.patch
patchset-pf/smb/0007-cifs-do-not-disable-interface-polling-on-failure.patch
patchset-pf/smb/0008-smb-improve-directory-cache-reuse-for-readdir-operat.patch
patchset-pf/smb/0009-ksmbd-fix-null-pointer-dereference-in-destroy_previo.patch
patchset-xanmod/binder/0001-binder-turn-into-module.patch patchset-xanmod/binder/0001-binder-turn-into-module.patch
patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch patchset-xanmod/clearlinux/0001-sched-wait-Do-accept-in-LIFO-order-for-cache-efficie.patch
@@ -241,50 +220,16 @@ patchset-zen/sauce/0020-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.pat
patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch patchset-zen/sauce/0021-ZEN-INTERACTIVE-mm-swap-Disable-swap-in-readahead.patch
patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch patchset-zen/sauce/0022-ZEN-INTERACTIVE-Document-PDS-BMQ-configuration.patch
patchset-pf/fixes/0001-mm-fix-ratelimit_pages-update-error-in-dirty_ratio_h.patch patchset-pf/fixes/0001-Revert-Disable-FOP_DONTCACHE-for-now-due-to-bugs.patch
patchset-pf/fixes/0002-vgacon-Add-check-for-vc_origin-address-range-in-vgac.patch patchset-pf/fixes/0002-mm-filemap-unify-read-write-dropbehind-naming.patch
patchset-pf/fixes/0003-fbdev-Fix-do_register_framebuffer-to-prevent-null-pt.patch patchset-pf/fixes/0003-mm-filemap-unify-dropbehind-flag-testing-and-clearin.patch
patchset-pf/fixes/0004-fbdev-Fix-fb_set_var-to-prevent-null-ptr-deref-in-fb.patch patchset-pf/fixes/0004-mm-khugepaged-fix-race-with-folio-split-free-using-t.patch
patchset-pf/fixes/0005-anon_inode-use-a-proper-mode-internally.patch patchset-pf/fixes/0005-mm-add-folio_expected_ref_count-for-reference-count-.patch
patchset-pf/fixes/0006-anon_inode-explicitly-block-setattr.patch patchset-pf/fixes/0006-drm-i915-snps_hdmi_pll-Fix-64-bit-divisor-truncation.patch
patchset-pf/fixes/0007-anon_inode-raise-SB_I_NODEV-and-SB_I_NOEXEC.patch patchset-pf/fixes/0007-mm-shmem-swap-fix-softlockup-with-mTHP-swapin.patch
patchset-pf/fixes/0008-fs-add-S_ANON_INODE.patch patchset-pf/fixes/0008-mm-gup-revert-mm-gup-fix-infinite-loop-within-__get_.patch
patchset-pf/fixes/0009-configfs-Do-not-override-creating-attribute-file-fai.patch patchset-pf/fixes/0009-mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap.patch
patchset-pf/fixes/0010-Revert-Disable-FOP_DONTCACHE-for-now-due-to-bugs.patch patchset-pf/fixes/0010-dm-raid-fix-variable-in-journal-device-check.patch
patchset-pf/fixes/0011-mm-filemap-unify-read-write-dropbehind-naming.patch
patchset-pf/fixes/0012-mm-filemap-unify-dropbehind-flag-testing-and-clearin.patch
patchset-pf/fixes/0013-mm-khugepaged-fix-race-with-folio-split-free-using-t.patch
patchset-pf/fixes/0014-mm-add-folio_expected_ref_count-for-reference-count-.patch
patchset-pf/fixes/0015-mm-fix-uprobe-pte-be-overwritten-when-expanding-vma.patch
patchset-pf/fixes/0016-mm-hugetlb-unshare-page-tables-during-VMA-split-not-.patch
patchset-pf/fixes/0017-mm-hugetlb-fix-huge_pmd_unshare-vs-GUP-fast-race.patch
patchset-pf/fixes/0018-mm-madvise-handle-madvise_lock-failure-during-race-u.patch
patchset-pf/fixes/0019-video-screen_info-Relocate-framebuffers-behind-PCI-b.patch
patchset-pf/fixes/0020-sysfb-Fix-screen_info-type-check-for-VGA.patch
patchset-pf/fixes/0021-watchdog-fix-watchdog-may-detect-false-positive-of-s.patch
patchset-pf/fixes/0022-sched-rt-Fix-race-in-push_rt_task.patch
patchset-pf/fixes/0023-sched-fair-Adhere-to-place_entity-constraints.patch
patchset-pf/fixes/0024-alloc_tag-handle-module-codetag-load-errors-as-modul.patch
patchset-pf/fixes/0025-svcrdma-Unregister-the-device-if-svc_rdma_accept-fai.patch
patchset-pf/fixes/0026-SUNRPC-Prevent-hang-on-NFS-mount-with-xprtsec-m-tls.patch
patchset-pf/fixes/0027-hv_netvsc-fix-potential-deadlock-in-netvsc_vf_setxdp.patch
patchset-pf/fixes/0028-net-clear-the-dst-when-changing-skb-protocol.patch
patchset-pf/fixes/0029-net_sched-sch_sfq-reject-invalid-perturb-period.patch
patchset-pf/fixes/0030-mm-vma-reset-VMA-iterator-on-commit_merge-OOM-failur.patch
patchset-pf/fixes/0031-mm-close-theoretical-race-where-stale-TLB-entries-co.patch
patchset-pf/fixes/0032-io_uring-kbuf-don-t-truncate-end-buffer-for-multiple.patch
patchset-pf/fixes/0033-nvme-always-punt-polled-uring_cmd-end_io-work-to-tas.patch
patchset-pf/fixes/0034-block-Clear-BIO_EMULATES_ZONE_APPEND-flag-on-BIO-com.patch
patchset-pf/fixes/0035-block-use-plug-request-list-tail-for-one-shot-backme.patch
patchset-pf/fixes/0036-Revert-mm-execmem-Unify-early-execmem_cache-behaviou.patch
patchset-pf/fixes/0037-x86-virt-tdx-Avoid-indirect-calls-to-TDX-assembly-fu.patch
patchset-pf/fixes/0038-x86-mm-pat-don-t-collapse-pages-without-PSE-set.patch
patchset-pf/fixes/0039-x86-Kconfig-only-enable-ROX-cache-in-execmem-when-ST.patch
patchset-pf/fixes/0040-x86-its-move-its_pages-array-to-struct-mod_arch_spec.patch
patchset-pf/fixes/0041-x86-its-explicitly-manage-permissions-for-ITS-pages.patch
patchset-pf/fixes/0042-KVM-SVM-Clear-current_vmcb-during-vCPU-free-for-all-.patch
patchset-pf/fixes/0043-KVM-VMX-Flush-shadow-VMCS-on-emergency-reboot.patch
patchset-zen/fixes/0001-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch patchset-zen/fixes/0001-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch
patchset-zen/fixes/0002-x86-cpu-Help-users-notice-when-running-old-Intel-mic.patch patchset-zen/fixes/0002-x86-cpu-Help-users-notice-when-running-old-Intel-mic.patch
patchset-zen/fixes/0003-drm-i915-snps_hdmi_pll-Fix-64-bit-divisor-truncation.patch

3
debian/rules.real vendored
View File

@@ -225,7 +225,7 @@ define dh_binary_post
dh_lintian dh_lintian
dh_icons dh_icons
dh_link dh_link
dh_compress dh_compress $(DH_COMPRESS_ARGS)
dh_fixperms dh_fixperms
dh_missing dh_missing
dh_strip $(DH_STRIP_ARGS) -Xvmlinux -Xvmlinuz dh_strip $(DH_STRIP_ARGS) -Xvmlinux -Xvmlinuz
@@ -491,6 +491,7 @@ binary_perf build_perf: export _PYTHON_SYSCONFIGDATA_NAME = _sysconfigdata__$(DE
build_perf: $(STAMPS_DIR)/build-tools-headers build_perf: $(STAMPS_DIR)/build-tools-headers
$(call make-tools,tools/perf) $(call make-tools,tools/perf)
binary_perf: DH_COMPRESS_ARGS = -Xtips.txt
binary_perf: DH_SHLIBDEPS_ARGS = -Xperf-read-vdso binary_perf: DH_SHLIBDEPS_ARGS = -Xperf-read-vdso
binary_perf: build_perf binary_perf: build_perf
$(dh_binary_pre) $(dh_binary_pre)