release 6.15.2 (preliminary)
@@ -1,27 +0,0 @@
From cb40e98d75a75567cbd10f9fc69c2ec12c87a445 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Wed, 5 Feb 2025 11:25:15 +0000
Subject: cpufreq/amd-pstate: Remove the redundant des_perf clamping in
adjust_perf

des_perf is later on clamped between min_perf and max_perf in
amd_pstate_update. So, remove the redundant clamping from
amd_pstate_adjust_perf.

Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 --
1 file changed, 2 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -705,8 +705,6 @@ static void amd_pstate_adjust_perf(unsig
if (max_perf < min_perf)
max_perf = min_perf;

- des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
-
amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
policy->governor->flags);
cpufreq_cpu_put(policy);

@@ -1,133 +0,0 @@
From f58e440e56a6c8a2c04894e5d169d1a98a8ce74f Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Wed, 5 Feb 2025 11:25:18 +0000
Subject: cpufreq/amd-pstate: Modularize perf<->freq conversion

Delegate the perf<->frequency conversion to helper functions to reduce
code duplication, and improve readability.

Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 57 +++++++++++++++++++-----------------
1 file changed, 30 insertions(+), 27 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -142,6 +142,20 @@ static struct quirk_entry quirk_amd_7k62
.lowest_freq = 550,
};

+static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val)
+{
+ u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf,
+ cpudata->nominal_freq);
+
+ return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf);
+}
+
+static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val)
+{
+ return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val,
+ cpudata->nominal_perf);
+}
+
static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
{
/**
@@ -534,7 +548,6 @@ static inline bool amd_pstate_sample(str
static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf,
u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags)
{
- unsigned long max_freq;
struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu);
u8 nominal_perf = READ_ONCE(cpudata->nominal_perf);

@@ -543,8 +556,7 @@ static void amd_pstate_update(struct amd

des_perf = clamp_t(u8, des_perf, min_perf, max_perf);

- max_freq = READ_ONCE(cpudata->max_limit_freq);
- policy->cur = div_u64(des_perf * max_freq, max_perf);
+ policy->cur = perf_to_freq(cpudata, des_perf);

if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
min_perf = des_perf;
@@ -594,14 +606,11 @@ static int amd_pstate_verify(struct cpuf

static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
{
- u8 max_limit_perf, min_limit_perf, max_perf;
- u32 max_freq;
+ u8 max_limit_perf, min_limit_perf;
struct amd_cpudata *cpudata = policy->driver_data;

- max_perf = READ_ONCE(cpudata->highest_perf);
- max_freq = READ_ONCE(cpudata->max_freq);
- max_limit_perf = div_u64(policy->max * max_perf, max_freq);
- min_limit_perf = div_u64(policy->min * max_perf, max_freq);
+ max_limit_perf = freq_to_perf(cpudata, policy->max);
+ min_limit_perf = freq_to_perf(cpudata, policy->min);

if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
min_limit_perf = min(cpudata->nominal_perf, max_limit_perf);
@@ -619,21 +628,15 @@ static int amd_pstate_update_freq(struct
{
struct cpufreq_freqs freqs;
struct amd_cpudata *cpudata = policy->driver_data;
- u8 des_perf, cap_perf;
-
- if (!cpudata->max_freq)
- return -ENODEV;
+ u8 des_perf;

if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
amd_pstate_update_min_max_limit(policy);

- cap_perf = READ_ONCE(cpudata->highest_perf);
-
freqs.old = policy->cur;
freqs.new = target_freq;

- des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf,
- cpudata->max_freq);
+ des_perf = freq_to_perf(cpudata, target_freq);

WARN_ON(fast_switch && !policy->fast_switch_enabled);
/*
@@ -907,7 +910,6 @@ static int amd_pstate_init_freq(struct a
{
int ret;
u32 min_freq, max_freq;
- u8 highest_perf, nominal_perf, lowest_nonlinear_perf;
u32 nominal_freq, lowest_nonlinear_freq;
struct cppc_perf_caps cppc_perf;

@@ -925,16 +927,17 @@ static int amd_pstate_init_freq(struct a
else
nominal_freq = cppc_perf.nominal_freq;

- highest_perf = READ_ONCE(cpudata->highest_perf);
- nominal_perf = READ_ONCE(cpudata->nominal_perf);
- max_freq = div_u64((u64)highest_perf * nominal_freq, nominal_perf);
-
- lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
- lowest_nonlinear_freq = div_u64((u64)nominal_freq * lowest_nonlinear_perf, nominal_perf);
- WRITE_ONCE(cpudata->min_freq, min_freq * 1000);
- WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000);
- WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000);
- WRITE_ONCE(cpudata->max_freq, max_freq * 1000);
+ min_freq *= 1000;
+ nominal_freq *= 1000;
+
+ WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
+ WRITE_ONCE(cpudata->min_freq, min_freq);
+
+ max_freq = perf_to_freq(cpudata, cpudata->highest_perf);
+ lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf);
+
+ WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
+ WRITE_ONCE(cpudata->max_freq, max_freq);

/**
* Below values need to be initialized correctly, otherwise driver will fail to load

@@ -1,37 +0,0 @@
From 0a12d4a3ca1a996c1073d60c6775424972e8b7b9 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Wed, 5 Feb 2025 11:25:19 +0000
Subject: cpufreq/amd-pstate: Remove the unnecessary cpufreq_update_policy call

The update_limits callback is only called in two conditions.

* When the preferred core rankings change. In which case, we just need to
change the prefcore ranking in the cpudata struct. As there are no changes
to any of the perf values, there is no need to call cpufreq_update_policy()

* When the _PPC ACPI object changes, i.e. the highest allowed Pstate
changes. The _PPC object is only used for a table based cpufreq driver
like acpi-cpufreq, hence is irrelevant for CPPC based amd-pstate.

Hence, the cpufreq_update_policy() call becomes unnecessary and can be
removed.

Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ----
1 file changed, 4 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -855,10 +855,6 @@ static void amd_pstate_update_limits(uns
sched_set_itmt_core_prio((int)cur_high, cpu);
}
cpufreq_cpu_put(policy);
-
- if (!highest_perf_changed)
- cpufreq_update_policy(cpu);
-
}

/*

@@ -1,124 +0,0 @@
From ab0520499c83ff44d468f1b2b604c85e2f78d694 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Wed, 5 Feb 2025 11:25:22 +0000
Subject: cpufreq/amd-pstate: Use scope based cleanup for cpufreq_policy refs

There have been instances in past where refcount decrementing is missed
while exiting a function. Use automatic scope based cleanup to avoid
such errors.

Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 25 ++++++++-----------------
include/linux/cpufreq.h | 3 +++
2 files changed, 11 insertions(+), 17 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -548,7 +548,7 @@ static inline bool amd_pstate_sample(str
static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf,
u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags)
{
- struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu);
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu);
u8 nominal_perf = READ_ONCE(cpudata->nominal_perf);

if (!policy)
@@ -574,8 +574,6 @@ static void amd_pstate_update(struct amd
}

amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch);
-
- cpufreq_cpu_put(policy);
}

static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
@@ -587,7 +585,8 @@ static int amd_pstate_verify(struct cpuf
* amd-pstate qos_requests.
*/
if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
- struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) =
+ cpufreq_cpu_get(policy_data->cpu);
struct amd_cpudata *cpudata;

if (!policy)
@@ -595,7 +594,6 @@ static int amd_pstate_verify(struct cpuf

cpudata = policy->driver_data;
policy_data->min = cpudata->lowest_nonlinear_freq;
- cpufreq_cpu_put(policy);
}

cpufreq_verify_within_cpu_limits(policy_data);
@@ -678,7 +676,7 @@ static void amd_pstate_adjust_perf(unsig
unsigned long capacity)
{
u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf;
- struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
struct amd_cpudata *cpudata;

if (!policy)
@@ -710,7 +708,6 @@ static void amd_pstate_adjust_perf(unsig

amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
policy->governor->flags);
- cpufreq_cpu_put(policy);
}

static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
@@ -823,28 +820,23 @@ static void amd_pstate_init_prefcore(str

static void amd_pstate_update_limits(unsigned int cpu)
{
- struct cpufreq_policy *policy = NULL;
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
struct amd_cpudata *cpudata;
u32 prev_high = 0, cur_high = 0;
- int ret;
bool highest_perf_changed = false;

if (!amd_pstate_prefcore)
return;

- policy = cpufreq_cpu_get(cpu);
if (!policy)
return;

- cpudata = policy->driver_data;
-
guard(mutex)(&amd_pstate_driver_lock);

- ret = amd_get_highest_perf(cpu, &cur_high);
- if (ret) {
- cpufreq_cpu_put(policy);
+ if (amd_get_highest_perf(cpu, &cur_high))
return;
- }
+
+ cpudata = policy->driver_data;

prev_high = READ_ONCE(cpudata->prefcore_ranking);
highest_perf_changed = (prev_high != cur_high);
@@ -854,7 +846,6 @@ static void amd_pstate_update_limits(uns
if (cur_high < CPPC_MAX_PERF)
sched_set_itmt_core_prio((int)cur_high, cpu);
}
- cpufreq_cpu_put(policy);
}

/*
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -213,6 +213,9 @@ static inline struct cpufreq_policy *cpu
static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { }
#endif

+/* Scope based cleanup macro for cpufreq_policy kobject reference counting */
+DEFINE_FREE(put_cpufreq_policy, struct cpufreq_policy *, if (_T) cpufreq_cpu_put(_T))
+
static inline bool policy_is_inactive(struct cpufreq_policy *policy)
{
return cpumask_empty(policy->cpus);

@@ -1,26 +0,0 @@
From 658a4b7a41583e3b73477c0fbbee07aa6d6f7e0e Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Wed, 5 Feb 2025 11:25:23 +0000
Subject: cpufreq/amd-pstate: Remove the unncecessary driver_lock in
amd_pstate_update_limits

There is no need to take a driver wide lock while updating the
highest_perf value in the percpu cpudata struct. Hence remove it.

Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 --
1 file changed, 2 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -831,8 +831,6 @@ static void amd_pstate_update_limits(uns
if (!policy)
return;

- guard(mutex)(&amd_pstate_driver_lock);
-
if (amd_get_highest_perf(cpu, &cur_high))
return;

@@ -1,35 +0,0 @@
From 20f8507de83bc844c6ff2329e61ffc37734364e9 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Date: Sat, 22 Feb 2025 03:32:22 +0000
Subject: cpufreq/amd-pstate: Fix the clamping of perf values

The clamping in freq_to_perf() is broken right now, as we first typecast
(read wraparound) the overflowing value into a u8 and then clamp it down.
So, use a u32 to store the >255 value in certain edge cases and then clamp
it down into a u8.

Also, use a "explicit typecast + clamp" instead of just a "clamp_t" as the
latter typecasts first and then clamps between the limits, which defeats
our purpose.

Fixes: 305621eb6a8b ("cpufreq/amd-pstate: Modularize perf<->freq conversion")
Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -144,10 +144,10 @@ static struct quirk_entry quirk_amd_7k62

static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val)
{
- u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf,
+ u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf,
cpudata->nominal_freq);

- return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf);
+ return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf);
}

static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val)

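The following aside is not part of the patch series; it is a minimal, self-contained C sketch of the wraparound the message above describes, using made-up values (perf_val = 300, limits 10 and 255) and local stand-in macros that only mimic the behaviour of the kernel's clamp_t()/clamp() helpers:

/* Illustrative only -- stand-in values and macros, not taken from the patch. */
#include <stdio.h>
#include <stdint.h>

/* Minimal stand-ins for the kernel helpers used in freq_to_perf(). */
#define clamp_t(type, val, lo, hi) \
        ((type)(val) < (type)(lo) ? (type)(lo) : \
         (type)(val) > (type)(hi) ? (type)(hi) : (type)(val))
#define clamp(val, lo, hi) \
        ((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

int main(void)
{
        uint32_t perf_val = 300;        /* result of the division, above 255 */
        uint32_t lowest = 10, highest = 255;

        /* Old code: cast to u8 first (300 wraps to 44), then clamp. */
        uint8_t before = clamp_t(uint8_t, perf_val, lowest, highest);
        /* Fixed code: clamp the u32 first, then cast. */
        uint8_t after = (uint8_t)clamp(perf_val, lowest, highest);

        printf("before=%u after=%u\n", before, after);
        return 0;
}

Built with any C compiler this prints "before=44 after=255": clamp_t() truncates 300 to a u8 (44) before clamping, while clamping the u32 first and casting afterwards yields the expected upper limit.
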
@@ -1,35 +0,0 @@
From 240a074b7f92278755df715be1ea5ea5d3d2f5ac Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 26 Feb 2025 01:49:17 -0600
Subject: cpufreq/amd-pstate: Show a warning when a CPU fails to setup

I came across a system that MSR_AMD_CPPC_CAP1 for some CPUs isn't
populated. This is an unexpected behavior that is most likely a
BIOS bug. In the event it happens I'd like users to report bugs
to properly root cause and get this fixed.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 ++
1 file changed, 2 insertions(+)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1027,6 +1027,7 @@ static int amd_pstate_cpu_init(struct cp
free_cpudata2:
freq_qos_remove_request(&cpudata->req[0]);
free_cpudata1:
+ pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret);
kfree(cpudata);
return ret;
}
@@ -1520,6 +1521,7 @@ static int amd_pstate_epp_cpu_init(struc
return 0;

free_cpudata1:
+ pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret);
kfree(cpudata);
return ret;
}

@@ -1,209 +0,0 @@
|
||||
From 82520910e91d62f19c944ff17ba8f966553e79d6 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:18 -0600
|
||||
Subject: cpufreq/amd-pstate: Drop min and max cached frequencies
|
||||
|
||||
Use the perf_to_freq helpers to calculate this on the fly.
|
||||
As the members are no longer cached add an extra check into
|
||||
amd_pstate_epp_update_limit() to avoid unnecessary calls in
|
||||
amd_pstate_update_min_max_limit().
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 14 +++++------
|
||||
drivers/cpufreq/amd-pstate.c | 43 +++++++++------------------------
|
||||
drivers/cpufreq/amd-pstate.h | 9 ++-----
|
||||
3 files changed, 20 insertions(+), 46 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -214,14 +214,14 @@ static void amd_pstate_ut_check_freq(u32
|
||||
break;
|
||||
cpudata = policy->driver_data;
|
||||
|
||||
- if (!((cpudata->max_freq >= cpudata->nominal_freq) &&
|
||||
+ if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) &&
|
||||
(cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
|
||||
- (cpudata->lowest_nonlinear_freq > cpudata->min_freq) &&
|
||||
- (cpudata->min_freq > 0))) {
|
||||
+ (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) &&
|
||||
+ (policy->cpuinfo.min_freq > 0))) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
|
||||
- __func__, cpu, cpudata->max_freq, cpudata->nominal_freq,
|
||||
- cpudata->lowest_nonlinear_freq, cpudata->min_freq);
|
||||
+ __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq,
|
||||
+ cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq);
|
||||
goto skip_test;
|
||||
}
|
||||
|
||||
@@ -233,13 +233,13 @@ static void amd_pstate_ut_check_freq(u32
|
||||
}
|
||||
|
||||
if (cpudata->boost_supported) {
|
||||
- if ((policy->max == cpudata->max_freq) ||
|
||||
+ if ((policy->max == policy->cpuinfo.max_freq) ||
|
||||
(policy->max == cpudata->nominal_freq))
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
else {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
|
||||
- __func__, cpu, policy->max, cpudata->max_freq,
|
||||
+ __func__, cpu, policy->max, policy->cpuinfo.max_freq,
|
||||
cpudata->nominal_freq);
|
||||
goto skip_test;
|
||||
}
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -717,7 +717,7 @@ static int amd_pstate_cpu_boost_update(s
|
||||
int ret = 0;
|
||||
|
||||
nominal_freq = READ_ONCE(cpudata->nominal_freq);
|
||||
- max_freq = READ_ONCE(cpudata->max_freq);
|
||||
+ max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf));
|
||||
|
||||
if (on)
|
||||
policy->cpuinfo.max_freq = max_freq;
|
||||
@@ -916,13 +916,10 @@ static int amd_pstate_init_freq(struct a
|
||||
nominal_freq *= 1000;
|
||||
|
||||
WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
|
||||
- WRITE_ONCE(cpudata->min_freq, min_freq);
|
||||
|
||||
max_freq = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf);
|
||||
-
|
||||
WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
|
||||
- WRITE_ONCE(cpudata->max_freq, max_freq);
|
||||
|
||||
/**
|
||||
* Below values need to be initialized correctly, otherwise driver will fail to load
|
||||
@@ -947,9 +944,9 @@ static int amd_pstate_init_freq(struct a
|
||||
|
||||
static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
|
||||
{
|
||||
- int min_freq, max_freq, ret;
|
||||
- struct device *dev;
|
||||
struct amd_cpudata *cpudata;
|
||||
+ struct device *dev;
|
||||
+ int ret;
|
||||
|
||||
/*
|
||||
* Resetting PERF_CTL_MSR will put the CPU in P0 frequency,
|
||||
@@ -980,17 +977,11 @@ static int amd_pstate_cpu_init(struct cp
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
|
||||
- min_freq = READ_ONCE(cpudata->min_freq);
|
||||
- max_freq = READ_ONCE(cpudata->max_freq);
|
||||
-
|
||||
policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu);
|
||||
policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu);
|
||||
|
||||
- policy->min = min_freq;
|
||||
- policy->max = max_freq;
|
||||
-
|
||||
- policy->cpuinfo.min_freq = min_freq;
|
||||
- policy->cpuinfo.max_freq = max_freq;
|
||||
+ policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
|
||||
+ policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
|
||||
policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
|
||||
|
||||
@@ -1014,9 +1005,6 @@ static int amd_pstate_cpu_init(struct cp
|
||||
goto free_cpudata2;
|
||||
}
|
||||
|
||||
- cpudata->max_limit_freq = max_freq;
|
||||
- cpudata->min_limit_freq = min_freq;
|
||||
-
|
||||
policy->driver_data = cpudata;
|
||||
|
||||
if (!current_pstate_driver->adjust_perf)
|
||||
@@ -1074,14 +1062,10 @@ static int amd_pstate_cpu_suspend(struct
|
||||
static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy,
|
||||
char *buf)
|
||||
{
|
||||
- int max_freq;
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
- max_freq = READ_ONCE(cpudata->max_freq);
|
||||
- if (max_freq < 0)
|
||||
- return max_freq;
|
||||
|
||||
- return sysfs_emit(buf, "%u\n", max_freq);
|
||||
+ return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)));
|
||||
}
|
||||
|
||||
static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy,
|
||||
@@ -1439,10 +1423,10 @@ static bool amd_pstate_acpi_pm_profile_u
|
||||
|
||||
static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
|
||||
{
|
||||
- int min_freq, max_freq, ret;
|
||||
struct amd_cpudata *cpudata;
|
||||
struct device *dev;
|
||||
u64 value;
|
||||
+ int ret;
|
||||
|
||||
/*
|
||||
* Resetting PERF_CTL_MSR will put the CPU in P0 frequency,
|
||||
@@ -1473,19 +1457,13 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
|
||||
- min_freq = READ_ONCE(cpudata->min_freq);
|
||||
- max_freq = READ_ONCE(cpudata->max_freq);
|
||||
-
|
||||
- policy->cpuinfo.min_freq = min_freq;
|
||||
- policy->cpuinfo.max_freq = max_freq;
|
||||
+ policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
|
||||
+ policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
/* It will be updated by governor */
|
||||
policy->cur = policy->cpuinfo.min_freq;
|
||||
|
||||
policy->driver_data = cpudata;
|
||||
|
||||
- policy->min = policy->cpuinfo.min_freq;
|
||||
- policy->max = policy->cpuinfo.max_freq;
|
||||
-
|
||||
policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
|
||||
|
||||
/*
|
||||
@@ -1543,7 +1521,8 @@ static int amd_pstate_epp_update_limit(s
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
u8 epp;
|
||||
|
||||
- amd_pstate_update_min_max_limit(policy);
|
||||
+ if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
|
||||
+ amd_pstate_update_min_max_limit(policy);
|
||||
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
epp = 0;
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -46,8 +46,6 @@ struct amd_aperf_mperf {
|
||||
* @max_limit_perf: Cached value of the performance corresponding to policy->max
|
||||
* @min_limit_freq: Cached value of policy->min (in khz)
|
||||
* @max_limit_freq: Cached value of policy->max (in khz)
|
||||
- * @max_freq: the frequency (in khz) that mapped to highest_perf
|
||||
- * @min_freq: the frequency (in khz) that mapped to lowest_perf
|
||||
* @nominal_freq: the frequency (in khz) that mapped to nominal_perf
|
||||
* @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf
|
||||
* @cur: Difference of Aperf/Mperf/tsc count between last and current sample
|
||||
@@ -77,11 +75,8 @@ struct amd_cpudata {
|
||||
u8 prefcore_ranking;
|
||||
u8 min_limit_perf;
|
||||
u8 max_limit_perf;
|
||||
- u32 min_limit_freq;
|
||||
- u32 max_limit_freq;
|
||||
-
|
||||
- u32 max_freq;
|
||||
- u32 min_freq;
|
||||
+ u32 min_limit_freq;
|
||||
+ u32 max_limit_freq;
|
||||
u32 nominal_freq;
|
||||
u32 lowest_nonlinear_freq;
|
||||
|
@@ -1,611 +0,0 @@
|
||||
From 21109b42429e0d9f0ee1bfadddae38fb5b0b23c3 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:19 -0600
|
||||
Subject: cpufreq/amd-pstate: Move perf values into a union
|
||||
|
||||
By storing perf values in a union all the writes and reads can
|
||||
be done atomically, removing the need for some concurrency protections.
|
||||
|
||||
While making this change, also drop the cached frequency values,
|
||||
using inline helpers to calculate them on demand from perf value.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 18 +--
|
||||
drivers/cpufreq/amd-pstate.c | 205 ++++++++++++++++++--------------
|
||||
drivers/cpufreq/amd-pstate.h | 51 +++++---
|
||||
3 files changed, 158 insertions(+), 116 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -129,6 +129,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
struct cpufreq_policy *policy = NULL;
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
+ union perf_cached cur_perf;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
@@ -162,19 +163,20 @@ static void amd_pstate_ut_check_perf(u32
|
||||
lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
}
|
||||
|
||||
- if (highest_perf != READ_ONCE(cpudata->highest_perf) && !cpudata->hw_prefcore) {
|
||||
+ cur_perf = READ_ONCE(cpudata->perf);
|
||||
+ if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
|
||||
pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
|
||||
- __func__, cpu, highest_perf, cpudata->highest_perf);
|
||||
+ __func__, cpu, highest_perf, cur_perf.highest_perf);
|
||||
goto skip_test;
|
||||
}
|
||||
- if ((nominal_perf != READ_ONCE(cpudata->nominal_perf)) ||
|
||||
- (lowest_nonlinear_perf != READ_ONCE(cpudata->lowest_nonlinear_perf)) ||
|
||||
- (lowest_perf != READ_ONCE(cpudata->lowest_perf))) {
|
||||
+ if (nominal_perf != cur_perf.nominal_perf ||
|
||||
+ (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
|
||||
+ (lowest_perf != cur_perf.lowest_perf)) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n",
|
||||
- __func__, cpu, nominal_perf, cpudata->nominal_perf,
|
||||
- lowest_nonlinear_perf, cpudata->lowest_nonlinear_perf,
|
||||
- lowest_perf, cpudata->lowest_perf);
|
||||
+ __func__, cpu, nominal_perf, cur_perf.nominal_perf,
|
||||
+ lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
|
||||
+ lowest_perf, cur_perf.lowest_perf);
|
||||
goto skip_test;
|
||||
}
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -142,18 +142,17 @@ static struct quirk_entry quirk_amd_7k62
|
||||
.lowest_freq = 550,
|
||||
};
|
||||
|
||||
-static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val)
|
||||
+static inline u8 freq_to_perf(union perf_cached perf, u32 nominal_freq, unsigned int freq_val)
|
||||
{
|
||||
- u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf,
|
||||
- cpudata->nominal_freq);
|
||||
+ u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * perf.nominal_perf, nominal_freq);
|
||||
|
||||
- return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf);
|
||||
+ return (u8)clamp(perf_val, perf.lowest_perf, perf.highest_perf);
|
||||
}
|
||||
|
||||
-static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val)
|
||||
+static inline u32 perf_to_freq(union perf_cached perf, u32 nominal_freq, u8 perf_val)
|
||||
{
|
||||
- return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val,
|
||||
- cpudata->nominal_perf);
|
||||
+ return DIV_ROUND_UP_ULL((u64)nominal_freq * perf_val,
|
||||
+ perf.nominal_perf);
|
||||
}
|
||||
|
||||
static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
|
||||
@@ -347,7 +346,9 @@ static int amd_pstate_set_energy_pref_in
|
||||
}
|
||||
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
epp,
|
||||
FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
|
||||
FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached),
|
||||
@@ -425,6 +426,7 @@ static inline int amd_pstate_cppc_enable
|
||||
|
||||
static int msr_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
u64 cap1, numerator;
|
||||
|
||||
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
|
||||
@@ -436,19 +438,21 @@ static int msr_init_perf(struct amd_cpud
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- WRITE_ONCE(cpudata->highest_perf, numerator);
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, numerator);
|
||||
- WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
|
||||
- WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
|
||||
- WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
|
||||
+ perf.highest_perf = numerator;
|
||||
+ perf.max_limit_perf = numerator;
|
||||
+ perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
+ perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
|
||||
+ perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
|
||||
+ perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
+ WRITE_ONCE(cpudata->perf, perf);
|
||||
WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1));
|
||||
- WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int shmem_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
u64 numerator;
|
||||
|
||||
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
|
||||
@@ -459,14 +463,14 @@ static int shmem_init_perf(struct amd_cp
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- WRITE_ONCE(cpudata->highest_perf, numerator);
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, numerator);
|
||||
- WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
|
||||
- WRITE_ONCE(cpudata->lowest_nonlinear_perf,
|
||||
- cppc_perf.lowest_nonlinear_perf);
|
||||
- WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf);
|
||||
+ perf.highest_perf = numerator;
|
||||
+ perf.max_limit_perf = numerator;
|
||||
+ perf.min_limit_perf = cppc_perf.lowest_perf;
|
||||
+ perf.nominal_perf = cppc_perf.nominal_perf;
|
||||
+ perf.lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf;
|
||||
+ perf.lowest_perf = cppc_perf.lowest_perf;
|
||||
+ WRITE_ONCE(cpudata->perf, perf);
|
||||
WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf);
|
||||
- WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf);
|
||||
|
||||
if (cppc_state == AMD_PSTATE_ACTIVE)
|
||||
return 0;
|
||||
@@ -549,14 +553,14 @@ static void amd_pstate_update(struct amd
|
||||
u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags)
|
||||
{
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu);
|
||||
- u8 nominal_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
if (!policy)
|
||||
return;
|
||||
|
||||
des_perf = clamp_t(u8, des_perf, min_perf, max_perf);
|
||||
|
||||
- policy->cur = perf_to_freq(cpudata, des_perf);
|
||||
+ policy->cur = perf_to_freq(perf, cpudata->nominal_freq, des_perf);
|
||||
|
||||
if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
|
||||
min_perf = des_perf;
|
||||
@@ -565,7 +569,7 @@ static void amd_pstate_update(struct amd
|
||||
|
||||
/* limit the max perf when core performance boost feature is disabled */
|
||||
if (!cpudata->boost_supported)
|
||||
- max_perf = min_t(u8, nominal_perf, max_perf);
|
||||
+ max_perf = min_t(u8, perf.nominal_perf, max_perf);
|
||||
|
||||
if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
|
||||
trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
|
||||
@@ -602,39 +606,41 @@ static int amd_pstate_verify(struct cpuf
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
|
||||
+static void amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
|
||||
{
|
||||
- u8 max_limit_perf, min_limit_perf;
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
- max_limit_perf = freq_to_perf(cpudata, policy->max);
|
||||
- min_limit_perf = freq_to_perf(cpudata, policy->min);
|
||||
+ perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max);
|
||||
+ perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min);
|
||||
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
- min_limit_perf = min(cpudata->nominal_perf, max_limit_perf);
|
||||
+ perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf);
|
||||
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
|
||||
- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
|
||||
WRITE_ONCE(cpudata->max_limit_freq, policy->max);
|
||||
WRITE_ONCE(cpudata->min_limit_freq, policy->min);
|
||||
-
|
||||
- return 0;
|
||||
+ WRITE_ONCE(cpudata->perf, perf);
|
||||
}
|
||||
|
||||
static int amd_pstate_update_freq(struct cpufreq_policy *policy,
|
||||
unsigned int target_freq, bool fast_switch)
|
||||
{
|
||||
struct cpufreq_freqs freqs;
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
u8 des_perf;
|
||||
|
||||
+ cpudata = policy->driver_data;
|
||||
+
|
||||
if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
|
||||
amd_pstate_update_min_max_limit(policy);
|
||||
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
freqs.old = policy->cur;
|
||||
freqs.new = target_freq;
|
||||
|
||||
- des_perf = freq_to_perf(cpudata, target_freq);
|
||||
+ des_perf = freq_to_perf(perf, cpudata->nominal_freq, target_freq);
|
||||
|
||||
WARN_ON(fast_switch && !policy->fast_switch_enabled);
|
||||
/*
|
||||
@@ -645,8 +651,8 @@ static int amd_pstate_update_freq(struct
|
||||
if (!fast_switch)
|
||||
cpufreq_freq_transition_begin(policy, &freqs);
|
||||
|
||||
- amd_pstate_update(cpudata, cpudata->min_limit_perf, des_perf,
|
||||
- cpudata->max_limit_perf, fast_switch,
|
||||
+ amd_pstate_update(cpudata, perf.min_limit_perf, des_perf,
|
||||
+ perf.max_limit_perf, fast_switch,
|
||||
policy->governor->flags);
|
||||
|
||||
if (!fast_switch)
|
||||
@@ -675,9 +681,10 @@ static void amd_pstate_adjust_perf(unsig
|
||||
unsigned long target_perf,
|
||||
unsigned long capacity)
|
||||
{
|
||||
- u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf;
|
||||
+ u8 max_perf, min_perf, des_perf, cap_perf;
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
|
||||
struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
|
||||
if (!policy)
|
||||
return;
|
||||
@@ -687,8 +694,8 @@ static void amd_pstate_adjust_perf(unsig
|
||||
if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
|
||||
amd_pstate_update_min_max_limit(policy);
|
||||
|
||||
- cap_perf = READ_ONCE(cpudata->highest_perf);
|
||||
- min_limit_perf = READ_ONCE(cpudata->min_limit_perf);
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
+ cap_perf = perf.highest_perf;
|
||||
|
||||
des_perf = cap_perf;
|
||||
if (target_perf < capacity)
|
||||
@@ -699,10 +706,10 @@ static void amd_pstate_adjust_perf(unsig
|
||||
else
|
||||
min_perf = cap_perf;
|
||||
|
||||
- if (min_perf < min_limit_perf)
|
||||
- min_perf = min_limit_perf;
|
||||
+ if (min_perf < perf.min_limit_perf)
|
||||
+ min_perf = perf.min_limit_perf;
|
||||
|
||||
- max_perf = cpudata->max_limit_perf;
|
||||
+ max_perf = perf.max_limit_perf;
|
||||
if (max_perf < min_perf)
|
||||
max_perf = min_perf;
|
||||
|
||||
@@ -713,11 +720,12 @@ static void amd_pstate_adjust_perf(unsig
|
||||
static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
u32 nominal_freq, max_freq;
|
||||
int ret = 0;
|
||||
|
||||
nominal_freq = READ_ONCE(cpudata->nominal_freq);
|
||||
- max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf));
|
||||
+ max_freq = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf);
|
||||
|
||||
if (on)
|
||||
policy->cpuinfo.max_freq = max_freq;
|
||||
@@ -881,30 +889,30 @@ static u32 amd_pstate_get_transition_lat
|
||||
}
|
||||
|
||||
/*
|
||||
- * amd_pstate_init_freq: Initialize the max_freq, min_freq,
|
||||
- * nominal_freq and lowest_nonlinear_freq for
|
||||
- * the @cpudata object.
|
||||
+ * amd_pstate_init_freq: Initialize the nominal_freq and lowest_nonlinear_freq
|
||||
+ * for the @cpudata object.
|
||||
*
|
||||
- * Requires: highest_perf, lowest_perf, nominal_perf and
|
||||
- * lowest_nonlinear_perf members of @cpudata to be
|
||||
- * initialized.
|
||||
+ * Requires: all perf members of @cpudata to be initialized.
|
||||
*
|
||||
- * Returns 0 on success, non-zero value on failure.
|
||||
+ * Returns 0 on success, non-zero value on failure.
|
||||
*/
|
||||
static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
|
||||
{
|
||||
- int ret;
|
||||
- u32 min_freq, max_freq;
|
||||
- u32 nominal_freq, lowest_nonlinear_freq;
|
||||
+ u32 min_freq, max_freq, nominal_freq, lowest_nonlinear_freq;
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
+ union perf_cached perf;
|
||||
+ int ret;
|
||||
|
||||
ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
|
||||
if (ret)
|
||||
return ret;
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
- if (quirks && quirks->lowest_freq)
|
||||
+ if (quirks && quirks->lowest_freq) {
|
||||
min_freq = quirks->lowest_freq;
|
||||
- else
|
||||
+ perf.lowest_perf = freq_to_perf(perf, nominal_freq, min_freq);
|
||||
+ WRITE_ONCE(cpudata->perf, perf);
|
||||
+ } else
|
||||
min_freq = cppc_perf.lowest_freq;
|
||||
|
||||
if (quirks && quirks->nominal_freq)
|
||||
@@ -917,8 +925,8 @@ static int amd_pstate_init_freq(struct a
|
||||
|
||||
WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
|
||||
|
||||
- max_freq = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
- lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf);
|
||||
+ max_freq = perf_to_freq(perf, nominal_freq, perf.highest_perf);
|
||||
+ lowest_nonlinear_freq = perf_to_freq(perf, nominal_freq, perf.lowest_nonlinear_perf);
|
||||
WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
|
||||
|
||||
/**
|
||||
@@ -945,6 +953,7 @@ static int amd_pstate_init_freq(struct a
|
||||
static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
struct device *dev;
|
||||
int ret;
|
||||
|
||||
@@ -980,8 +989,14 @@ static int amd_pstate_cpu_init(struct cp
|
||||
policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu);
|
||||
policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu);
|
||||
|
||||
- policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
|
||||
- policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
+ policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf,
|
||||
+ cpudata->nominal_freq,
|
||||
+ perf.lowest_perf);
|
||||
+ policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf,
|
||||
+ cpudata->nominal_freq,
|
||||
+ perf.highest_perf);
|
||||
|
||||
policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
|
||||
|
||||
@@ -1062,23 +1077,27 @@ static int amd_pstate_cpu_suspend(struct
|
||||
static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy,
|
||||
char *buf)
|
||||
{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
|
||||
+ cpudata = policy->driver_data;
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
- return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)));
|
||||
+ return sysfs_emit(buf, "%u\n",
|
||||
+ perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf));
|
||||
}
|
||||
|
||||
static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy,
|
||||
char *buf)
|
||||
{
|
||||
- int freq;
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
|
||||
- freq = READ_ONCE(cpudata->lowest_nonlinear_freq);
|
||||
- if (freq < 0)
|
||||
- return freq;
|
||||
+ cpudata = policy->driver_data;
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
- return sysfs_emit(buf, "%u\n", freq);
|
||||
+ return sysfs_emit(buf, "%u\n",
|
||||
+ perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_nonlinear_perf));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1088,12 +1107,11 @@ static ssize_t show_amd_pstate_lowest_no
|
||||
static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy,
|
||||
char *buf)
|
||||
{
|
||||
- u8 perf;
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
|
||||
- perf = READ_ONCE(cpudata->highest_perf);
|
||||
+ cpudata = policy->driver_data;
|
||||
|
||||
- return sysfs_emit(buf, "%u\n", perf);
|
||||
+ return sysfs_emit(buf, "%u\n", cpudata->perf.highest_perf);
|
||||
}
|
||||
|
||||
static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy,
|
||||
@@ -1424,6 +1442,7 @@ static bool amd_pstate_acpi_pm_profile_u
|
||||
static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata;
|
||||
+ union perf_cached perf;
|
||||
struct device *dev;
|
||||
u64 value;
|
||||
int ret;
|
||||
@@ -1457,8 +1476,15 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
|
||||
- policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf);
|
||||
- policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf);
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
+ policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf,
|
||||
+ cpudata->nominal_freq,
|
||||
+ perf.lowest_perf);
|
||||
+ policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf,
|
||||
+ cpudata->nominal_freq,
|
||||
+ perf.highest_perf);
|
||||
+
|
||||
/* It will be updated by governor */
|
||||
policy->cur = policy->cpuinfo.min_freq;
|
||||
|
||||
@@ -1519,6 +1545,7 @@ static void amd_pstate_epp_cpu_exit(stru
|
||||
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ union perf_cached perf;
|
||||
u8 epp;
|
||||
|
||||
if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
|
||||
@@ -1529,15 +1556,16 @@ static int amd_pstate_epp_update_limit(s
|
||||
else
|
||||
epp = READ_ONCE(cpudata->epp_cached);
|
||||
|
||||
+ perf = READ_ONCE(cpudata->perf);
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp,
|
||||
- cpudata->min_limit_perf,
|
||||
- cpudata->max_limit_perf,
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp,
|
||||
+ perf.min_limit_perf,
|
||||
+ perf.max_limit_perf,
|
||||
policy->boost_enabled);
|
||||
}
|
||||
|
||||
- return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
|
||||
- cpudata->max_limit_perf, epp, false);
|
||||
+ return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U,
|
||||
+ perf.max_limit_perf, epp, false);
|
||||
}
|
||||
|
||||
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
|
||||
@@ -1569,20 +1597,18 @@ static int amd_pstate_epp_set_policy(str
|
||||
static int amd_pstate_epp_reenable(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- u8 max_perf;
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
int ret;
|
||||
|
||||
ret = amd_pstate_cppc_enable(true);
|
||||
if (ret)
|
||||
pr_err("failed to enable amd pstate during resume, return %d\n", ret);
|
||||
|
||||
- max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
-
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
cpudata->epp_cached,
|
||||
FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
|
||||
- max_perf, policy->boost_enabled);
|
||||
+ perf.highest_perf, policy->boost_enabled);
|
||||
}
|
||||
|
||||
return amd_pstate_epp_update_limit(policy);
|
||||
@@ -1606,22 +1632,21 @@ static int amd_pstate_epp_cpu_online(str
|
||||
static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- u8 min_perf;
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
if (cpudata->suspended)
|
||||
return 0;
|
||||
|
||||
- min_perf = READ_ONCE(cpudata->lowest_perf);
|
||||
-
|
||||
guard(mutex)(&amd_pstate_limits_lock);
|
||||
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
AMD_CPPC_EPP_BALANCE_POWERSAVE,
|
||||
- min_perf, min_perf, policy->boost_enabled);
|
||||
+ perf.lowest_perf, perf.lowest_perf,
|
||||
+ policy->boost_enabled);
|
||||
}
|
||||
|
||||
- return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf,
|
||||
+ return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf,
|
||||
AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
|
||||
}
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -13,6 +13,36 @@
|
||||
/*********************************************************************
|
||||
* AMD P-state INTERFACE *
|
||||
*********************************************************************/
|
||||
+
|
||||
+/**
|
||||
+ * union perf_cached - A union to cache performance-related data.
|
||||
+ * @highest_perf: the maximum performance an individual processor may reach,
|
||||
+ * assuming ideal conditions
|
||||
+ * For platforms that support the preferred core feature, the highest_perf value maybe
|
||||
+ * configured to any value in the range 166-255 by the firmware (because the preferred
|
||||
+ * core ranking is encoded in the highest_perf value). To maintain consistency across
|
||||
+ * all platforms, we split the highest_perf and preferred core ranking values into
|
||||
+ * cpudata->perf.highest_perf and cpudata->prefcore_ranking.
|
||||
+ * @nominal_perf: the maximum sustained performance level of the processor,
|
||||
+ * assuming ideal operating conditions
|
||||
+ * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
|
||||
+ * savings are achieved
|
||||
+ * @lowest_perf: the absolute lowest performance level of the processor
|
||||
+ * @min_limit_perf: Cached value of the performance corresponding to policy->min
|
||||
+ * @max_limit_perf: Cached value of the performance corresponding to policy->max
|
||||
+ */
|
||||
+union perf_cached {
|
||||
+ struct {
|
||||
+ u8 highest_perf;
|
||||
+ u8 nominal_perf;
|
||||
+ u8 lowest_nonlinear_perf;
|
||||
+ u8 lowest_perf;
|
||||
+ u8 min_limit_perf;
|
||||
+ u8 max_limit_perf;
|
||||
+ };
|
||||
+ u64 val;
|
||||
+};
|
||||
+
|
||||
/**
|
||||
* struct amd_aperf_mperf
|
||||
* @aperf: actual performance frequency clock count
|
||||
@@ -30,20 +60,9 @@ struct amd_aperf_mperf {
|
||||
* @cpu: CPU number
|
||||
* @req: constraint request to apply
|
||||
* @cppc_req_cached: cached performance request hints
|
||||
- * @highest_perf: the maximum performance an individual processor may reach,
|
||||
- * assuming ideal conditions
|
||||
- * For platforms that do not support the preferred core feature, the
|
||||
- * highest_pef may be configured with 166 or 255, to avoid max frequency
|
||||
- * calculated wrongly. we take the fixed value as the highest_perf.
|
||||
- * @nominal_perf: the maximum sustained performance level of the processor,
|
||||
- * assuming ideal operating conditions
|
||||
- * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power
|
||||
- * savings are achieved
|
||||
- * @lowest_perf: the absolute lowest performance level of the processor
|
||||
+ * @perf: cached performance-related data
|
||||
* @prefcore_ranking: the preferred core ranking, the higher value indicates a higher
|
||||
* priority.
|
||||
- * @min_limit_perf: Cached value of the performance corresponding to policy->min
|
||||
- * @max_limit_perf: Cached value of the performance corresponding to policy->max
|
||||
* @min_limit_freq: Cached value of policy->min (in khz)
|
||||
* @max_limit_freq: Cached value of policy->max (in khz)
|
||||
* @nominal_freq: the frequency (in khz) that mapped to nominal_perf
|
||||
@@ -68,13 +87,9 @@ struct amd_cpudata {
|
||||
struct freq_qos_request req[2];
|
||||
u64 cppc_req_cached;
|
||||
|
||||
- u8 highest_perf;
|
||||
- u8 nominal_perf;
|
||||
- u8 lowest_nonlinear_perf;
|
||||
- u8 lowest_perf;
|
||||
+ union perf_cached perf;
|
||||
+
|
||||
u8 prefcore_ranking;
|
||||
- u8 min_limit_perf;
|
||||
- u8 max_limit_perf;
|
||||
u32 min_limit_freq;
|
||||
u32 max_limit_freq;
|
||||
u32 nominal_freq;
|
@@ -1,81 +0,0 @@
|
||||
From 0daee82069cfe4a322bed954a4a5f19226e49e95 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:20 -0600
|
||||
Subject: cpufreq/amd-pstate: Overhaul locking
|
||||
|
||||
amd_pstate_cpu_boost_update() and refresh_frequency_limits() both
|
||||
update the policy state and have nothing to do with the amd-pstate
|
||||
driver itself.
|
||||
|
||||
A global "limits" lock doesn't make sense because each CPU can have
|
||||
policies changed independently. Each time a CPU changes values they
|
||||
will atomically be written to the per-CPU perf member. Drop per CPU
|
||||
locking cases.
|
||||
|
||||
The remaining "global" driver lock is used to ensure that only one
|
||||
entity can change driver modes at a given time.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 13 +++----------
|
||||
1 file changed, 3 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -196,7 +196,6 @@ static inline int get_mode_idx_from_str(
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
-static DEFINE_MUTEX(amd_pstate_limits_lock);
|
||||
static DEFINE_MUTEX(amd_pstate_driver_lock);
|
||||
|
||||
static u8 msr_get_epp(struct amd_cpudata *cpudata)
|
||||
@@ -1169,8 +1168,6 @@ static ssize_t store_energy_performance_
|
||||
if (ret < 0)
|
||||
return -EINVAL;
|
||||
|
||||
- guard(mutex)(&amd_pstate_limits_lock);
|
||||
-
|
||||
ret = amd_pstate_set_energy_pref_index(policy, ret);
|
||||
|
||||
return ret ? ret : count;
|
||||
@@ -1343,8 +1340,10 @@ int amd_pstate_update_status(const char
|
||||
if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
- if (mode_state_machine[cppc_state][mode_idx])
|
||||
+ if (mode_state_machine[cppc_state][mode_idx]) {
|
||||
+ guard(mutex)(&amd_pstate_driver_lock);
|
||||
return mode_state_machine[cppc_state][mode_idx](mode_idx);
|
||||
+ }
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1365,7 +1364,6 @@ static ssize_t status_store(struct devic
|
||||
char *p = memchr(buf, '\n', count);
|
||||
int ret;
|
||||
|
||||
- guard(mutex)(&amd_pstate_driver_lock);
|
||||
ret = amd_pstate_update_status(buf, p ? p - buf : count);
|
||||
|
||||
return ret < 0 ? ret : count;
|
||||
@@ -1637,8 +1635,6 @@ static int amd_pstate_epp_cpu_offline(st
|
||||
if (cpudata->suspended)
|
||||
return 0;
|
||||
|
||||
- guard(mutex)(&amd_pstate_limits_lock);
|
||||
-
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
AMD_CPPC_EPP_BALANCE_POWERSAVE,
|
||||
@@ -1678,8 +1674,6 @@ static int amd_pstate_epp_resume(struct
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
if (cpudata->suspended) {
|
||||
- guard(mutex)(&amd_pstate_limits_lock);
|
||||
-
|
||||
/* enable amd pstate from suspend state*/
|
||||
amd_pstate_epp_reenable(policy);
|
||||
|
@@ -1,48 +0,0 @@
|
||||
From 7c820a91ffd02aa7e426e8801893575f218a7a80 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:21 -0600
|
||||
Subject: cpufreq/amd-pstate: Drop `cppc_cap1_cached`
|
||||
|
||||
The `cppc_cap1_cached` variable isn't used at all, there is no
|
||||
need to read it at initialization for each CPU.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 5 -----
|
||||
drivers/cpufreq/amd-pstate.h | 2 --
|
||||
2 files changed, 7 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1508,11 +1508,6 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
if (ret)
|
||||
return ret;
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
-
|
||||
- ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- WRITE_ONCE(cpudata->cppc_cap1_cached, value);
|
||||
}
|
||||
ret = amd_pstate_set_epp(cpudata, cpudata->epp_default);
|
||||
if (ret)
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -76,7 +76,6 @@ struct amd_aperf_mperf {
|
||||
* AMD P-State driver supports preferred core featue.
|
||||
* @epp_cached: Cached CPPC energy-performance preference value
|
||||
* @policy: Cpufreq policy value
|
||||
- * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value
|
||||
*
|
||||
* The amd_cpudata is key private data for each CPU thread in AMD P-State, and
|
||||
* represents all the attributes and goals that AMD P-State requests at runtime.
|
||||
@@ -105,7 +104,6 @@ struct amd_cpudata {
|
||||
/* EPP feature related attributes*/
|
||||
u8 epp_cached;
|
||||
u32 policy;
|
||||
- u64 cppc_cap1_cached;
|
||||
bool suspended;
|
||||
u8 epp_default;
|
||||
};
|
@@ -1,144 +0,0 @@
|
||||
From 5d0c340db98de378a11abfbaf587b6e601e7291c Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:22 -0600
|
||||
Subject: cpufreq/amd-pstate-ut: Use _free macro to free put policy
|
||||
|
||||
Using a scoped cleanup macro simplifies cleanup code.
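
As a rough sketch of the pattern (the DEFINE_FREE() name and the kmalloc()
example below are illustrative only, not the driver's actual helpers),
<linux/cleanup.h> ties a destructor to a variable so every early return
releases the resource automatically:

	#include <linux/cleanup.h>
	#include <linux/errno.h>
	#include <linux/slab.h>

	/* Register a cleanup action: kfree() runs on the variable at scope exit. */
	DEFINE_FREE(kfree_example, void *, if (_T) kfree(_T))

	static int example_use(size_t len)
	{
		void *buf __free(kfree_example) = kmalloc(len, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;

		/* ... use buf; no explicit kfree() is needed on any return path ... */
		return 0;
	}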
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 33 ++++++++++++++-------------------
|
||||
1 file changed, 14 insertions(+), 19 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/fs.h>
|
||||
+#include <linux/cleanup.h>
|
||||
|
||||
#include <acpi/cppc_acpi.h>
|
||||
|
||||
@@ -127,11 +128,12 @@ static void amd_pstate_ut_check_perf(u32
|
||||
u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
|
||||
u64 cap1 = 0;
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
- struct cpufreq_policy *policy = NULL;
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
union perf_cached cur_perf;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
+
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
||||
break;
|
||||
@@ -142,7 +144,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
if (ret) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
|
||||
highest_perf = cppc_perf.highest_perf;
|
||||
@@ -154,7 +156,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
if (ret) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
|
||||
highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
|
||||
@@ -167,7 +169,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
|
||||
pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
|
||||
__func__, cpu, highest_perf, cur_perf.highest_perf);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
if (nominal_perf != cur_perf.nominal_perf ||
|
||||
(lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
|
||||
@@ -177,7 +179,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
__func__, cpu, nominal_perf, cur_perf.nominal_perf,
|
||||
lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
|
||||
lowest_perf, cur_perf.lowest_perf);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
|
||||
if (!((highest_perf >= nominal_perf) &&
|
||||
@@ -188,15 +190,11 @@ static void amd_pstate_ut_check_perf(u32
|
||||
pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
|
||||
__func__, cpu, highest_perf, nominal_perf,
|
||||
lowest_nonlinear_perf, lowest_perf);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
- cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- return;
|
||||
-skip_test:
|
||||
- cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -207,10 +205,11 @@ skip_test:
|
||||
static void amd_pstate_ut_check_freq(u32 index)
|
||||
{
|
||||
int cpu = 0;
|
||||
- struct cpufreq_policy *policy = NULL;
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
+ struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
+
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
||||
break;
|
||||
@@ -224,14 +223,14 @@ static void amd_pstate_ut_check_freq(u32
|
||||
pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
|
||||
__func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq,
|
||||
cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
|
||||
if (cpudata->lowest_nonlinear_freq != policy->min) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
|
||||
__func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
|
||||
if (cpudata->boost_supported) {
|
||||
@@ -243,20 +242,16 @@ static void amd_pstate_ut_check_freq(u32
|
||||
pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
|
||||
__func__, cpu, policy->max, policy->cpuinfo.max_freq,
|
||||
cpudata->nominal_freq);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
} else {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d must support boost!\n", __func__, cpu);
|
||||
- goto skip_test;
|
||||
+ return;
|
||||
}
|
||||
- cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- return;
|
||||
-skip_test:
|
||||
- cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
static int amd_pstate_set_mode(enum amd_pstate_mode mode)
|
@@ -1,37 +0,0 @@
|
||||
From 8937b7068ca30072c4c4cf4c22000112afbd6839 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:23 -0600
|
||||
Subject: cpufreq/amd-pstate-ut: Allow lowest nonlinear and lowest to be the
|
||||
same
|
||||
|
||||
Several Ryzen AI processors support the exact same value for lowest
|
||||
nonlinear perf and lowest perf. Loosen up the unit tests to allow this
|
||||
scenario.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -184,7 +184,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
|
||||
if (!((highest_perf >= nominal_perf) &&
|
||||
(nominal_perf > lowest_nonlinear_perf) &&
|
||||
- (lowest_nonlinear_perf > lowest_perf) &&
|
||||
+ (lowest_nonlinear_perf >= lowest_perf) &&
|
||||
(lowest_perf > 0))) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
|
||||
@@ -217,7 +217,7 @@ static void amd_pstate_ut_check_freq(u32
|
||||
|
||||
if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) &&
|
||||
(cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
|
||||
- (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) &&
|
||||
+ (cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) &&
|
||||
(policy->cpuinfo.min_freq > 0))) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
|
@@ -1,309 +0,0 @@
|
||||
From 8cb701e059fa08dcb9ab74e3c84abc224ff72714 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:24 -0600
|
||||
Subject: cpufreq/amd-pstate-ut: Drop SUCCESS and FAIL enums
|
||||
|
||||
The result enum is effectively used as a boolean and doesn't show
|
||||
the return value of the failing call.
|
||||
|
||||
Instead of using enums, switch to returning the actual return
|
||||
code from the unit test.
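
A minimal sketch of the resulting shape (the names below are made up; the
real tests live in amd-pstate-ut.c): each case returns 0 or a negative
errno, and the runner logs the specific value on failure:

	#include <linux/errno.h>
	#include <linux/kernel.h>
	#include <linux/printk.h>

	struct example_ut_case {
		const char *name;
		int (*func)(u32 index);
	};

	static int example_case_ok(u32 index)  { return 0; }
	static int example_case_bad(u32 index) { return -EINVAL; }

	static struct example_ut_case example_cases[] = {
		{ "case_ok",  example_case_ok  },
		{ "case_bad", example_case_bad },
	};

	static void example_run_all(void)
	{
		for (u32 i = 0; i < ARRAY_SIZE(example_cases); i++) {
			int ret = example_cases[i].func(i);

			/* the failing errno is now visible in the log */
			if (ret)
				pr_err("%-20s fail: %d\n", example_cases[i].name, ret);
			else
				pr_info("%-20s success\n", example_cases[i].name);
		}
	}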
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 143 ++++++++++++--------------------
|
||||
1 file changed, 55 insertions(+), 88 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -32,30 +32,20 @@
|
||||
|
||||
#include "amd-pstate.h"
|
||||
|
||||
-/*
|
||||
- * Abbreviations:
|
||||
- * amd_pstate_ut: used as a shortform for AMD P-State unit test.
|
||||
- * It helps to keep variable names smaller, simpler
|
||||
- */
|
||||
-enum amd_pstate_ut_result {
|
||||
- AMD_PSTATE_UT_RESULT_PASS,
|
||||
- AMD_PSTATE_UT_RESULT_FAIL,
|
||||
-};
|
||||
|
||||
struct amd_pstate_ut_struct {
|
||||
const char *name;
|
||||
- void (*func)(u32 index);
|
||||
- enum amd_pstate_ut_result result;
|
||||
+ int (*func)(u32 index);
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel module for testing the AMD P-State unit test
|
||||
*/
|
||||
-static void amd_pstate_ut_acpi_cpc_valid(u32 index);
|
||||
-static void amd_pstate_ut_check_enabled(u32 index);
|
||||
-static void amd_pstate_ut_check_perf(u32 index);
|
||||
-static void amd_pstate_ut_check_freq(u32 index);
|
||||
-static void amd_pstate_ut_check_driver(u32 index);
|
||||
+static int amd_pstate_ut_acpi_cpc_valid(u32 index);
|
||||
+static int amd_pstate_ut_check_enabled(u32 index);
|
||||
+static int amd_pstate_ut_check_perf(u32 index);
|
||||
+static int amd_pstate_ut_check_freq(u32 index);
|
||||
+static int amd_pstate_ut_check_driver(u32 index);
|
||||
|
||||
static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
|
||||
{"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid },
|
||||
@@ -78,51 +68,46 @@ static bool get_shared_mem(void)
|
||||
/*
|
||||
* check the _CPC object is present in SBIOS.
|
||||
*/
|
||||
-static void amd_pstate_ut_acpi_cpc_valid(u32 index)
|
||||
+static int amd_pstate_ut_acpi_cpc_valid(u32 index)
|
||||
{
|
||||
- if (acpi_cpc_valid())
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- else {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
+ if (!acpi_cpc_valid()) {
|
||||
pr_err("%s the _CPC object is not present in SBIOS!\n", __func__);
|
||||
+ return -EINVAL;
|
||||
}
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
-static void amd_pstate_ut_pstate_enable(u32 index)
|
||||
+/*
|
||||
+ * check if amd pstate is enabled
|
||||
+ */
|
||||
+static int amd_pstate_ut_check_enabled(u32 index)
|
||||
{
|
||||
- int ret = 0;
|
||||
u64 cppc_enable = 0;
|
||||
+ int ret;
|
||||
+
|
||||
+ if (get_shared_mem())
|
||||
+ return 0;
|
||||
|
||||
ret = rdmsrl_safe(MSR_AMD_CPPC_ENABLE, &cppc_enable);
|
||||
if (ret) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s rdmsrl_safe MSR_AMD_CPPC_ENABLE ret=%d error!\n", __func__, ret);
|
||||
- return;
|
||||
+ return ret;
|
||||
}
|
||||
- if (cppc_enable)
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- else {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
+
|
||||
+ if (!cppc_enable) {
|
||||
pr_err("%s amd pstate must be enabled!\n", __func__);
|
||||
+ return -EINVAL;
|
||||
}
|
||||
-}
|
||||
|
||||
-/*
|
||||
- * check if amd pstate is enabled
|
||||
- */
|
||||
-static void amd_pstate_ut_check_enabled(u32 index)
|
||||
-{
|
||||
- if (get_shared_mem())
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- else
|
||||
- amd_pstate_ut_pstate_enable(index);
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* check if performance values are reasonable.
|
||||
* highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0
|
||||
*/
|
||||
-static void amd_pstate_ut_check_perf(u32 index)
|
||||
+static int amd_pstate_ut_check_perf(u32 index)
|
||||
{
|
||||
int cpu = 0, ret = 0;
|
||||
u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
|
||||
@@ -142,9 +127,8 @@ static void amd_pstate_ut_check_perf(u32
|
||||
if (get_shared_mem()) {
|
||||
ret = cppc_get_perf_caps(cpu, &cppc_perf);
|
||||
if (ret) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret);
|
||||
- return;
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
highest_perf = cppc_perf.highest_perf;
|
||||
@@ -154,9 +138,8 @@ static void amd_pstate_ut_check_perf(u32
|
||||
} else {
|
||||
ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
|
||||
if (ret) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret);
|
||||
- return;
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
|
||||
@@ -169,32 +152,30 @@ static void amd_pstate_ut_check_perf(u32
|
||||
if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) {
|
||||
pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n",
|
||||
__func__, cpu, highest_perf, cur_perf.highest_perf);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
if (nominal_perf != cur_perf.nominal_perf ||
|
||||
(lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) ||
|
||||
(lowest_perf != cur_perf.lowest_perf)) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n",
|
||||
__func__, cpu, nominal_perf, cur_perf.nominal_perf,
|
||||
lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf,
|
||||
lowest_perf, cur_perf.lowest_perf);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
|
||||
if (!((highest_perf >= nominal_perf) &&
|
||||
(nominal_perf > lowest_nonlinear_perf) &&
|
||||
(lowest_nonlinear_perf >= lowest_perf) &&
|
||||
(lowest_perf > 0))) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n",
|
||||
__func__, cpu, highest_perf, nominal_perf,
|
||||
lowest_nonlinear_perf, lowest_perf);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -202,7 +183,7 @@ static void amd_pstate_ut_check_perf(u32
|
||||
* max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0
|
||||
* check max freq when set support boost mode.
|
||||
*/
|
||||
-static void amd_pstate_ut_check_freq(u32 index)
|
||||
+static int amd_pstate_ut_check_freq(u32 index)
|
||||
{
|
||||
int cpu = 0;
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
@@ -219,39 +200,33 @@ static void amd_pstate_ut_check_freq(u32
|
||||
(cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
|
||||
(cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) &&
|
||||
(policy->cpuinfo.min_freq > 0))) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
|
||||
__func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq,
|
||||
cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
|
||||
if (cpudata->lowest_nonlinear_freq != policy->min) {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
|
||||
__func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
|
||||
if (cpudata->boost_supported) {
|
||||
- if ((policy->max == policy->cpuinfo.max_freq) ||
|
||||
- (policy->max == cpudata->nominal_freq))
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
- else {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
+ if ((policy->max != policy->cpuinfo.max_freq) &&
|
||||
+ (policy->max != cpudata->nominal_freq)) {
|
||||
pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
|
||||
__func__, cpu, policy->max, policy->cpuinfo.max_freq,
|
||||
cpudata->nominal_freq);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
} else {
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
pr_err("%s cpu%d must support boost!\n", __func__, cpu);
|
||||
- return;
|
||||
+ return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
- amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static int amd_pstate_set_mode(enum amd_pstate_mode mode)
|
||||
@@ -263,32 +238,28 @@ static int amd_pstate_set_mode(enum amd_
|
||||
return amd_pstate_update_status(mode_str, strlen(mode_str));
|
||||
}
|
||||
|
||||
-static void amd_pstate_ut_check_driver(u32 index)
|
||||
+static int amd_pstate_ut_check_driver(u32 index)
|
||||
{
|
||||
enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
|
||||
- int ret;
|
||||
|
||||
for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
|
||||
- ret = amd_pstate_set_mode(mode1);
|
||||
+ int ret = amd_pstate_set_mode(mode1);
|
||||
if (ret)
|
||||
- goto out;
|
||||
+ return ret;
|
||||
for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
|
||||
if (mode1 == mode2)
|
||||
continue;
|
||||
ret = amd_pstate_set_mode(mode2);
|
||||
- if (ret)
|
||||
- goto out;
|
||||
+ if (ret) {
|
||||
+ pr_err("%s: failed to update status for %s->%s\n", __func__,
|
||||
+ amd_pstate_get_mode_string(mode1),
|
||||
+ amd_pstate_get_mode_string(mode2));
|
||||
+ return ret;
|
||||
+ }
|
||||
}
|
||||
}
|
||||
-out:
|
||||
- if (ret)
|
||||
- pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
|
||||
- amd_pstate_get_mode_string(mode1),
|
||||
- amd_pstate_get_mode_string(mode2), ret);
|
||||
-
|
||||
- amd_pstate_ut_cases[index].result = ret ?
|
||||
- AMD_PSTATE_UT_RESULT_FAIL :
|
||||
- AMD_PSTATE_UT_RESULT_PASS;
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static int __init amd_pstate_ut_init(void)
|
||||
@@ -296,16 +267,12 @@ static int __init amd_pstate_ut_init(voi
|
||||
u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);
|
||||
|
||||
for (i = 0; i < arr_size; i++) {
|
||||
- amd_pstate_ut_cases[i].func(i);
|
||||
- switch (amd_pstate_ut_cases[i].result) {
|
||||
- case AMD_PSTATE_UT_RESULT_PASS:
|
||||
+ int ret = amd_pstate_ut_cases[i].func(i);
|
||||
+
|
||||
+ if (ret)
|
||||
+ pr_err("%-4d %-20s\t fail: %d!\n", i+1, amd_pstate_ut_cases[i].name, ret);
|
||||
+ else
|
||||
pr_info("%-4d %-20s\t success!\n", i+1, amd_pstate_ut_cases[i].name);
|
||||
- break;
|
||||
- case AMD_PSTATE_UT_RESULT_FAIL:
|
||||
- default:
|
||||
- pr_info("%-4d %-20s\t fail!\n", i+1, amd_pstate_ut_cases[i].name);
|
||||
- break;
|
||||
- }
|
||||
}
|
||||
|
||||
return 0;
|
@@ -1,50 +0,0 @@
|
||||
From c553e0165997349a3f831fa04bdd7f61913a3442 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:25 -0600
|
||||
Subject: cpufreq/amd-pstate-ut: Run on all of the correct CPUs
|
||||
|
||||
If a CPU is missing a policy or one has been offlined then the unit test
|
||||
is skipped for the rest of the CPUs on the system.
|
||||
|
||||
Instead, iterate online CPUs and skip any missing policies to allow
|
||||
continuing to test the rest of them.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 8 ++++----
|
||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -116,12 +116,12 @@ static int amd_pstate_ut_check_perf(u32
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
union perf_cached cur_perf;
|
||||
|
||||
- for_each_possible_cpu(cpu) {
|
||||
+ for_each_online_cpu(cpu) {
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
||||
- break;
|
||||
+ continue;
|
||||
cpudata = policy->driver_data;
|
||||
|
||||
if (get_shared_mem()) {
|
||||
@@ -188,12 +188,12 @@ static int amd_pstate_ut_check_freq(u32
|
||||
int cpu = 0;
|
||||
struct amd_cpudata *cpudata = NULL;
|
||||
|
||||
- for_each_possible_cpu(cpu) {
|
||||
+ for_each_online_cpu(cpu) {
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
||||
- break;
|
||||
+ continue;
|
||||
cpudata = policy->driver_data;
|
||||
|
||||
if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) &&
|
@@ -1,42 +0,0 @@
|
||||
From c4197fd693cb98a8a71557187a7cf592d6b68b3c Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:26 -0600
|
||||
Subject: cpufreq/amd-pstate-ut: Adjust variable scope
|
||||
|
||||
In amd_pstate_ut_check_freq() and amd_pstate_ut_check_perf() the cpudata
|
||||
variable is only needed in the scope of the for loop. Move it there.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -113,11 +113,11 @@ static int amd_pstate_ut_check_perf(u32
|
||||
u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0;
|
||||
u64 cap1 = 0;
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
- struct amd_cpudata *cpudata = NULL;
|
||||
union perf_cached cur_perf;
|
||||
|
||||
for_each_online_cpu(cpu) {
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
||||
@@ -186,10 +186,10 @@ static int amd_pstate_ut_check_perf(u32
|
||||
static int amd_pstate_ut_check_freq(u32 index)
|
||||
{
|
||||
int cpu = 0;
|
||||
- struct amd_cpudata *cpudata = NULL;
|
||||
|
||||
for_each_online_cpu(cpu) {
|
||||
struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL;
|
||||
+ struct amd_cpudata *cpudata;
|
||||
|
||||
policy = cpufreq_cpu_get(cpu);
|
||||
if (!policy)
|
@@ -1,123 +0,0 @@
|
||||
From 19c375251767f49b62894d3b4782f0b8b01313b8 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:27 -0600
|
||||
Subject: cpufreq/amd-pstate: Replace all AMD_CPPC_* macros with masks
|
||||
|
||||
Bitfield masks are easier to follow and less error prone.
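
For reference, a small sketch of the two helpers this moves to (the field
layout below is illustrative; the real masks are defined in msr-index.h):
GENMASK() describes the field once, FIELD_GET() extracts it, and
FIELD_PREP() places a new value at the right bit position:

	#include <linux/bitfield.h>
	#include <linux/bits.h>
	#include <linux/types.h>

	#define EXAMPLE_LOWEST_PERF_MASK	GENMASK(7, 0)
	#define EXAMPLE_NOMINAL_PERF_MASK	GENMASK(23, 16)

	static u8 example_nominal_perf(u64 cap1)
	{
		/* shift and mask in one step, derived from the mask definition */
		return FIELD_GET(EXAMPLE_NOMINAL_PERF_MASK, cap1);
	}

	static u64 example_set_lowest_perf(u64 req, u8 lowest)
	{
		req &= ~EXAMPLE_LOWEST_PERF_MASK;
		return req | FIELD_PREP(EXAMPLE_LOWEST_PERF_MASK, lowest);
	}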
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
arch/x86/include/asm/msr-index.h | 20 +++++++++++---------
|
||||
arch/x86/kernel/acpi/cppc.c | 4 +++-
|
||||
drivers/cpufreq/amd-pstate-ut.c | 9 +++++----
|
||||
drivers/cpufreq/amd-pstate.c | 16 ++++++----------
|
||||
4 files changed, 25 insertions(+), 24 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/msr-index.h
|
||||
+++ b/arch/x86/include/asm/msr-index.h
|
||||
@@ -709,15 +709,17 @@
|
||||
#define MSR_AMD_CPPC_REQ 0xc00102b3
|
||||
#define MSR_AMD_CPPC_STATUS 0xc00102b4
|
||||
|
||||
-#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
|
||||
-#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
|
||||
-#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
|
||||
-#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
|
||||
+/* Masks for use with MSR_AMD_CPPC_CAP1 */
|
||||
+#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0)
|
||||
+#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8)
|
||||
+#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16)
|
||||
+#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24)
|
||||
|
||||
-#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
|
||||
-#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
|
||||
-#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
|
||||
-#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
|
||||
+/* Masks for use with MSR_AMD_CPPC_REQ */
|
||||
+#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
|
||||
+#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
|
||||
+#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
|
||||
+#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
|
||||
|
||||
/* AMD Performance Counter Global Status and Control MSRs */
|
||||
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -4,6 +4,8 @@
|
||||
* Copyright (c) 2016, Intel Corporation.
|
||||
*/
|
||||
|
||||
+#include <linux/bitfield.h>
|
||||
+
|
||||
#include <acpi/cppc_acpi.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/processor.h>
|
||||
@@ -149,7 +151,7 @@ int amd_get_highest_perf(unsigned int cp
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
- val = AMD_CPPC_HIGHEST_PERF(val);
|
||||
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
|
||||
} else {
|
||||
ret = cppc_get_highest_perf(cpu, &val);
|
||||
if (ret)
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
+#include <linux/bitfield.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
@@ -142,10 +143,10 @@ static int amd_pstate_ut_check_perf(u32
|
||||
return ret;
|
||||
}
|
||||
|
||||
- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
|
||||
- nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
|
||||
- lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
|
||||
- lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
+ highest_perf = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1);
|
||||
+ nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1);
|
||||
+ lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1);
|
||||
+ lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
|
||||
}
|
||||
|
||||
cur_perf = READ_ONCE(cpudata->perf);
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -89,11 +89,6 @@ static bool cppc_enabled;
|
||||
static bool amd_pstate_prefcore = true;
|
||||
static struct quirk_entry *quirks;
|
||||
|
||||
-#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0)
|
||||
-#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8)
|
||||
-#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16)
|
||||
-#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24)
|
||||
-
|
||||
/*
|
||||
* AMD Energy Preference Performance (EPP)
|
||||
* The EPP is used in the CCLK DPM controller to drive
|
||||
@@ -439,12 +434,13 @@ static int msr_init_perf(struct amd_cpud
|
||||
|
||||
perf.highest_perf = numerator;
|
||||
perf.max_limit_perf = numerator;
|
||||
- perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
- perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1);
|
||||
- perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1);
|
||||
- perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1);
|
||||
+ perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
|
||||
+ perf.nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1);
|
||||
+ perf.lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1);
|
||||
+ perf.lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
|
||||
WRITE_ONCE(cpudata->perf, perf);
|
||||
- WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1));
|
||||
+ WRITE_ONCE(cpudata->prefcore_ranking, FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1));
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
@@ -1,60 +0,0 @@
|
||||
From bb7fadf4a86e19b52cbe850c9274bfa643d3ce52 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:28 -0600
|
||||
Subject: cpufreq/amd-pstate: Cache CPPC request in shared mem case too
|
||||
|
||||
In order to avoid a potentially redundant write in shmem_update_perf(),
|
||||
cache the request into the cppc_req_cached variable normally only
|
||||
used for the MSR case.
|
||||
|
||||
This adds symmetry into the code and potentially avoids extra writes.
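
A rough sketch of the idea, with an illustrative caller-supplied write
callback rather than the driver's real interface: rebuild the request word,
skip the firmware call when nothing changed, and update the cache only after
a successful write:

	#include <linux/types.h>

	static int example_update_request(u64 *cached, u64 new_value,
					  int (*hw_write)(u64 value))
	{
		int ret;

		/* nothing changed since the last request, so skip the write */
		if (new_value == *cached)
			return 0;

		ret = hw_write(new_value);
		if (ret)
			return ret;

		/* cache only what the hardware actually accepted */
		*cached = new_value;
		return 0;
	}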
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 22 +++++++++++++++++++++-
|
||||
1 file changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -496,6 +496,8 @@ static int shmem_update_perf(struct amd_
|
||||
u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
|
||||
{
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
+ u64 value, prev;
|
||||
+ int ret;
|
||||
|
||||
if (cppc_state == AMD_PSTATE_ACTIVE) {
|
||||
int ret = shmem_set_epp(cpudata, epp);
|
||||
@@ -504,11 +506,29 @@ static int shmem_update_perf(struct amd_
|
||||
return ret;
|
||||
}
|
||||
|
||||
+ value = prev = READ_ONCE(cpudata->cppc_req_cached);
|
||||
+
|
||||
+ value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK |
|
||||
+ AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK);
|
||||
+ value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf);
|
||||
+ value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf);
|
||||
+ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
|
||||
+ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
+
|
||||
+ if (value == prev)
|
||||
+ return 0;
|
||||
+
|
||||
perf_ctrls.max_perf = max_perf;
|
||||
perf_ctrls.min_perf = min_perf;
|
||||
perf_ctrls.desired_perf = des_perf;
|
||||
|
||||
- return cppc_set_perf(cpudata->cpu, &perf_ctrls);
|
||||
+ ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
|
@@ -1,318 +0,0 @@
|
||||
From e02f8a14d44223160d348d5841cc3dd916a14401 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:29 -0600
|
||||
Subject: cpufreq/amd-pstate: Move all EPP tracing into *_update_perf and
|
||||
*_set_epp functions
|
||||
|
||||
The EPP tracing is done by the caller today, but this loses the
|
||||
information about whether the CPPC request has changed.
|
||||
|
||||
Move it into the update_perf and set_epp functions and include information
|
||||
about whether the request has changed from the last one.
|
||||
amd_pstate_update_perf() and amd_pstate_set_epp() now require the policy
|
||||
as an argument instead of the cpudata.
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-trace.h | 13 +++-
|
||||
drivers/cpufreq/amd-pstate.c | 118 +++++++++++++++++------------
|
||||
2 files changed, 80 insertions(+), 51 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-trace.h
|
||||
+++ b/drivers/cpufreq/amd-pstate-trace.h
|
||||
@@ -90,7 +90,8 @@ TRACE_EVENT(amd_pstate_epp_perf,
|
||||
u8 epp,
|
||||
u8 min_perf,
|
||||
u8 max_perf,
|
||||
- bool boost
|
||||
+ bool boost,
|
||||
+ bool changed
|
||||
),
|
||||
|
||||
TP_ARGS(cpu_id,
|
||||
@@ -98,7 +99,8 @@ TRACE_EVENT(amd_pstate_epp_perf,
|
||||
epp,
|
||||
min_perf,
|
||||
max_perf,
|
||||
- boost),
|
||||
+ boost,
|
||||
+ changed),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned int, cpu_id)
|
||||
@@ -107,6 +109,7 @@ TRACE_EVENT(amd_pstate_epp_perf,
|
||||
__field(u8, min_perf)
|
||||
__field(u8, max_perf)
|
||||
__field(bool, boost)
|
||||
+ __field(bool, changed)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@@ -116,15 +119,17 @@ TRACE_EVENT(amd_pstate_epp_perf,
|
||||
__entry->min_perf = min_perf;
|
||||
__entry->max_perf = max_perf;
|
||||
__entry->boost = boost;
|
||||
+ __entry->changed = changed;
|
||||
),
|
||||
|
||||
- TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u",
|
||||
+ TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u, changed=%u",
|
||||
(unsigned int)__entry->cpu_id,
|
||||
(u8)__entry->min_perf,
|
||||
(u8)__entry->max_perf,
|
||||
(u8)__entry->highest_perf,
|
||||
(u8)__entry->epp,
|
||||
- (bool)__entry->boost
|
||||
+ (bool)__entry->boost,
|
||||
+ (bool)__entry->changed
|
||||
)
|
||||
);
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -228,9 +228,10 @@ static u8 shmem_get_epp(struct amd_cpuda
|
||||
return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
}
|
||||
|
||||
-static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf,
|
||||
+static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf,
|
||||
u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
|
||||
{
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
u64 value, prev;
|
||||
|
||||
value = prev = READ_ONCE(cpudata->cppc_req_cached);
|
||||
@@ -242,6 +243,18 @@ static int msr_update_perf(struct amd_cp
|
||||
value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
|
||||
value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
|
||||
+ if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu,
|
||||
+ perf.highest_perf,
|
||||
+ epp,
|
||||
+ min_perf,
|
||||
+ max_perf,
|
||||
+ policy->boost_enabled,
|
||||
+ value != prev);
|
||||
+ }
|
||||
+
|
||||
if (value == prev)
|
||||
return 0;
|
||||
|
||||
@@ -256,24 +269,26 @@ static int msr_update_perf(struct amd_cp
|
||||
}
|
||||
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
- WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
+ if (epp != cpudata->epp_cached)
|
||||
+ WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
|
||||
|
||||
-static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata,
|
||||
+static inline int amd_pstate_update_perf(struct cpufreq_policy *policy,
|
||||
u8 min_perf, u8 des_perf,
|
||||
u8 max_perf, u8 epp,
|
||||
bool fast_switch)
|
||||
{
|
||||
- return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
|
||||
+ return static_call(amd_pstate_update_perf)(policy, min_perf, des_perf,
|
||||
max_perf, epp, fast_switch);
|
||||
}
|
||||
|
||||
-static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp)
|
||||
+static int msr_set_epp(struct cpufreq_policy *policy, u8 epp)
|
||||
{
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
u64 value, prev;
|
||||
int ret;
|
||||
|
||||
@@ -281,6 +296,19 @@ static int msr_set_epp(struct amd_cpudat
|
||||
value &= ~AMD_CPPC_EPP_PERF_MASK;
|
||||
value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
|
||||
+ if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
+ union perf_cached perf = cpudata->perf;
|
||||
+
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
+ epp,
|
||||
+ FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
|
||||
+ cpudata->cppc_req_cached),
|
||||
+ FIELD_GET(AMD_CPPC_MAX_PERF_MASK,
|
||||
+ cpudata->cppc_req_cached),
|
||||
+ policy->boost_enabled,
|
||||
+ value != prev);
|
||||
+ }
|
||||
+
|
||||
if (value == prev)
|
||||
return 0;
|
||||
|
||||
@@ -299,15 +327,29 @@ static int msr_set_epp(struct amd_cpudat
|
||||
|
||||
DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp);
|
||||
|
||||
-static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp)
|
||||
+static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp)
|
||||
{
|
||||
- return static_call(amd_pstate_set_epp)(cpudata, epp);
|
||||
+ return static_call(amd_pstate_set_epp)(policy, epp);
|
||||
}
|
||||
|
||||
-static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp)
|
||||
+static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp)
|
||||
{
|
||||
- int ret;
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
+ int ret;
|
||||
+
|
||||
+ if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
+ union perf_cached perf = cpudata->perf;
|
||||
+
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
+ epp,
|
||||
+ FIELD_GET(AMD_CPPC_MIN_PERF_MASK,
|
||||
+ cpudata->cppc_req_cached),
|
||||
+ FIELD_GET(AMD_CPPC_MAX_PERF_MASK,
|
||||
+ cpudata->cppc_req_cached),
|
||||
+ policy->boost_enabled,
|
||||
+ epp != cpudata->epp_cached);
|
||||
+ }
|
||||
|
||||
if (epp == cpudata->epp_cached)
|
||||
return 0;
|
||||
@@ -339,17 +381,7 @@ static int amd_pstate_set_energy_pref_in
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
- if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
-
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
- epp,
|
||||
- FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
|
||||
- FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached),
|
||||
- policy->boost_enabled);
|
||||
- }
|
||||
-
|
||||
- return amd_pstate_set_epp(cpudata, epp);
|
||||
+ return amd_pstate_set_epp(policy, epp);
|
||||
}
|
||||
|
||||
static inline int msr_cppc_enable(bool enable)
|
||||
@@ -492,15 +524,16 @@ static inline int amd_pstate_init_perf(s
|
||||
return static_call(amd_pstate_init_perf)(cpudata);
|
||||
}
|
||||
|
||||
-static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf,
|
||||
+static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf,
|
||||
u8 des_perf, u8 max_perf, u8 epp, bool fast_switch)
|
||||
{
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
u64 value, prev;
|
||||
int ret;
|
||||
|
||||
if (cppc_state == AMD_PSTATE_ACTIVE) {
|
||||
- int ret = shmem_set_epp(cpudata, epp);
|
||||
+ int ret = shmem_set_epp(policy, epp);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
@@ -515,6 +548,18 @@ static int shmem_update_perf(struct amd_
|
||||
value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf);
|
||||
value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
|
||||
+ if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
+ union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
+
|
||||
+ trace_amd_pstate_epp_perf(cpudata->cpu,
|
||||
+ perf.highest_perf,
|
||||
+ epp,
|
||||
+ min_perf,
|
||||
+ max_perf,
|
||||
+ policy->boost_enabled,
|
||||
+ value != prev);
|
||||
+ }
|
||||
+
|
||||
if (value == prev)
|
||||
return 0;
|
||||
|
||||
@@ -592,7 +637,7 @@ static void amd_pstate_update(struct amd
|
||||
cpudata->cpu, fast_switch);
|
||||
}
|
||||
|
||||
- amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch);
|
||||
+ amd_pstate_update_perf(policy, min_perf, des_perf, max_perf, 0, fast_switch);
|
||||
}
|
||||
|
||||
static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
|
||||
@@ -1525,7 +1570,7 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
return ret;
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
}
|
||||
- ret = amd_pstate_set_epp(cpudata, cpudata->epp_default);
|
||||
+ ret = amd_pstate_set_epp(policy, cpudata->epp_default);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -1566,14 +1611,8 @@ static int amd_pstate_epp_update_limit(s
|
||||
epp = READ_ONCE(cpudata->epp_cached);
|
||||
|
||||
perf = READ_ONCE(cpudata->perf);
|
||||
- if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp,
|
||||
- perf.min_limit_perf,
|
||||
- perf.max_limit_perf,
|
||||
- policy->boost_enabled);
|
||||
- }
|
||||
|
||||
- return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U,
|
||||
+ return amd_pstate_update_perf(policy, perf.min_limit_perf, 0U,
|
||||
perf.max_limit_perf, epp, false);
|
||||
}
|
||||
|
||||
@@ -1605,20 +1644,12 @@ static int amd_pstate_epp_set_policy(str
|
||||
|
||||
static int amd_pstate_epp_reenable(struct cpufreq_policy *policy)
|
||||
{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
int ret;
|
||||
|
||||
ret = amd_pstate_cppc_enable(true);
|
||||
if (ret)
|
||||
pr_err("failed to enable amd pstate during resume, return %d\n", ret);
|
||||
|
||||
- if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
- cpudata->epp_cached,
|
||||
- FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached),
|
||||
- perf.highest_perf, policy->boost_enabled);
|
||||
- }
|
||||
|
||||
return amd_pstate_epp_update_limit(policy);
|
||||
}
|
||||
@@ -1646,14 +1677,7 @@ static int amd_pstate_epp_cpu_offline(st
|
||||
if (cpudata->suspended)
|
||||
return 0;
|
||||
|
||||
- if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
- trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf,
|
||||
- AMD_CPPC_EPP_BALANCE_POWERSAVE,
|
||||
- perf.lowest_perf, perf.lowest_perf,
|
||||
- policy->boost_enabled);
|
||||
- }
|
||||
-
|
||||
- return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf,
|
||||
+ return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf,
|
||||
AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
|
||||
}
|
||||
|
@@ -1,37 +0,0 @@
|
||||
From 5f0b3bf5497422293576a0783e47d203c52ed863 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:30 -0600
|
||||
Subject: cpufreq/amd-pstate: Update cppc_req_cached for shared mem EPP writes
|
||||
|
||||
On EPP-only writes, update the cached variable so that the min/max
|
||||
performance controls don't need to be updated again.
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -336,6 +336,7 @@ static int shmem_set_epp(struct cpufreq_
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
+ u64 value;
|
||||
int ret;
|
||||
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
@@ -362,6 +363,11 @@ static int shmem_set_epp(struct cpufreq_
|
||||
}
|
||||
WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
|
||||
+ value = READ_ONCE(cpudata->cppc_req_cached);
|
||||
+ value &= ~AMD_CPPC_EPP_PERF_MASK;
|
||||
+ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp);
|
||||
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
+
|
||||
return ret;
|
||||
}
|
||||
|
@@ -1,38 +0,0 @@
|
||||
From 6c2201fe880d7d35fbde67d74ec1989f053cc0bd Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:31 -0600
|
||||
Subject: cpufreq/amd-pstate: Drop debug statements for policy setting
|
||||
|
||||
There are trace events that exist now for all amd-pstate modes that
|
||||
will output information right before programming to the hardware.
|
||||
|
||||
This leaves the existing debug statements as unnecessary
|
||||
overhead. Drop them.
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 4 ----
|
||||
1 file changed, 4 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -667,7 +667,6 @@ static int amd_pstate_verify(struct cpuf
|
||||
}
|
||||
|
||||
cpufreq_verify_within_cpu_limits(policy_data);
|
||||
- pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1630,9 +1629,6 @@ static int amd_pstate_epp_set_policy(str
|
||||
if (!policy->cpuinfo.max_freq)
|
||||
return -ENODEV;
|
||||
|
||||
- pr_debug("set_policy: cpuinfo.max %u policy->max %u\n",
|
||||
- policy->cpuinfo.max_freq, policy->max);
|
||||
-
|
||||
cpudata->policy = policy->policy;
|
||||
|
||||
ret = amd_pstate_epp_update_limit(policy);
|
@@ -1,327 +0,0 @@
|
||||
From 3c5030a27361deff20bec5d43339109901f3198c Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:32 -0600
|
||||
Subject: cpufreq/amd-pstate: Rework CPPC enabling
|
||||
|
||||
The CPPC enable register is configured as "write once". That is,
|
||||
any future writes don't actually do anything.
|
||||
|
||||
Because of this, all the cleanup paths that currently exist for
|
||||
CPPC disable have no effect.
|
||||
|
||||
Rework CPPC enable to only enable after all the CAP registers have
|
||||
been read to avoid enabling CPPC on CPUs with invalid _CPC or
|
||||
unpopulated MSRs.
|
||||
|
||||
As the register is write once, remove all cleanup paths as well.
|
||||
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 179 +++++++----------------------------
|
||||
1 file changed, 35 insertions(+), 144 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -85,7 +85,6 @@ static struct cpufreq_driver *current_ps
|
||||
static struct cpufreq_driver amd_pstate_driver;
|
||||
static struct cpufreq_driver amd_pstate_epp_driver;
|
||||
static int cppc_state = AMD_PSTATE_UNDEFINED;
|
||||
-static bool cppc_enabled;
|
||||
static bool amd_pstate_prefcore = true;
|
||||
static struct quirk_entry *quirks;
|
||||
|
||||
@@ -371,89 +370,21 @@ static int shmem_set_epp(struct cpufreq_
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy,
|
||||
- int pref_index)
|
||||
+static inline int msr_cppc_enable(struct cpufreq_policy *policy)
|
||||
{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- u8 epp;
|
||||
-
|
||||
- if (!pref_index)
|
||||
- epp = cpudata->epp_default;
|
||||
- else
|
||||
- epp = epp_values[pref_index];
|
||||
-
|
||||
- if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
|
||||
- pr_debug("EPP cannot be set under performance policy\n");
|
||||
- return -EBUSY;
|
||||
- }
|
||||
-
|
||||
- return amd_pstate_set_epp(policy, epp);
|
||||
-}
|
||||
-
|
||||
-static inline int msr_cppc_enable(bool enable)
|
||||
-{
|
||||
- int ret, cpu;
|
||||
- unsigned long logical_proc_id_mask = 0;
|
||||
-
|
||||
- /*
|
||||
- * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared.
|
||||
- */
|
||||
- if (!enable)
|
||||
- return 0;
|
||||
-
|
||||
- if (enable == cppc_enabled)
|
||||
- return 0;
|
||||
-
|
||||
- for_each_present_cpu(cpu) {
|
||||
- unsigned long logical_id = topology_logical_package_id(cpu);
|
||||
-
|
||||
- if (test_bit(logical_id, &logical_proc_id_mask))
|
||||
- continue;
|
||||
-
|
||||
- set_bit(logical_id, &logical_proc_id_mask);
|
||||
-
|
||||
- ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE,
|
||||
- enable);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
- cppc_enabled = enable;
|
||||
- return 0;
|
||||
+ return wrmsrl_safe_on_cpu(policy->cpu, MSR_AMD_CPPC_ENABLE, 1);
|
||||
}
|
||||
|
||||
-static int shmem_cppc_enable(bool enable)
|
||||
+static int shmem_cppc_enable(struct cpufreq_policy *policy)
|
||||
{
|
||||
- int cpu, ret = 0;
|
||||
- struct cppc_perf_ctrls perf_ctrls;
|
||||
-
|
||||
- if (enable == cppc_enabled)
|
||||
- return 0;
|
||||
-
|
||||
- for_each_present_cpu(cpu) {
|
||||
- ret = cppc_set_enable(cpu, enable);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
-
|
||||
- /* Enable autonomous mode for EPP */
|
||||
- if (cppc_state == AMD_PSTATE_ACTIVE) {
|
||||
- /* Set desired perf as zero to allow EPP firmware control */
|
||||
- perf_ctrls.desired_perf = 0;
|
||||
- ret = cppc_set_perf(cpu, &perf_ctrls);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- cppc_enabled = enable;
|
||||
- return ret;
|
||||
+ return cppc_set_enable(policy->cpu, 1);
|
||||
}
|
||||
|
||||
DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable);
|
||||
|
||||
-static inline int amd_pstate_cppc_enable(bool enable)
|
||||
+static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy)
|
||||
{
|
||||
- return static_call(amd_pstate_cppc_enable)(enable);
|
||||
+ return static_call(amd_pstate_cppc_enable)(policy);
|
||||
}
|
||||
|
||||
static int msr_init_perf(struct amd_cpudata *cpudata)
|
||||
@@ -1063,6 +994,10 @@ static int amd_pstate_cpu_init(struct cp
|
||||
cpudata->nominal_freq,
|
||||
perf.highest_perf);
|
||||
|
||||
+ ret = amd_pstate_cppc_enable(policy);
|
||||
+ if (ret)
|
||||
+ goto free_cpudata1;
|
||||
+
|
||||
policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
|
||||
|
||||
/* It will be updated by governor */
|
||||
@@ -1110,28 +1045,6 @@ static void amd_pstate_cpu_exit(struct c
|
||||
kfree(cpudata);
|
||||
}
|
||||
|
||||
-static int amd_pstate_cpu_resume(struct cpufreq_policy *policy)
|
||||
-{
|
||||
- int ret;
|
||||
-
|
||||
- ret = amd_pstate_cppc_enable(true);
|
||||
- if (ret)
|
||||
- pr_err("failed to enable amd-pstate during resume, return %d\n", ret);
|
||||
-
|
||||
- return ret;
|
||||
-}
|
||||
-
|
||||
-static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy)
|
||||
-{
|
||||
- int ret;
|
||||
-
|
||||
- ret = amd_pstate_cppc_enable(false);
|
||||
- if (ret)
|
||||
- pr_err("failed to disable amd-pstate during suspend, return %d\n", ret);
|
||||
-
|
||||
- return ret;
|
||||
-}
|
||||
-
|
||||
/* Sysfs attributes */
|
||||
|
||||
/*
|
||||
@@ -1223,8 +1136,10 @@ static ssize_t show_energy_performance_a
|
||||
static ssize_t store_energy_performance_preference(
|
||||
struct cpufreq_policy *policy, const char *buf, size_t count)
|
||||
{
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
char str_preference[21];
|
||||
ssize_t ret;
|
||||
+ u8 epp;
|
||||
|
||||
ret = sscanf(buf, "%20s", str_preference);
|
||||
if (ret != 1)
|
||||
@@ -1234,7 +1149,17 @@ static ssize_t store_energy_performance_
|
||||
if (ret < 0)
|
||||
return -EINVAL;
|
||||
|
||||
- ret = amd_pstate_set_energy_pref_index(policy, ret);
|
||||
+ if (!ret)
|
||||
+ epp = cpudata->epp_default;
|
||||
+ else
|
||||
+ epp = epp_values[ret];
|
||||
+
|
||||
+ if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
|
||||
+ pr_debug("EPP cannot be set under performance policy\n");
|
||||
+ return -EBUSY;
|
||||
+ }
|
||||
+
|
||||
+ ret = amd_pstate_set_epp(policy, epp);
|
||||
|
||||
return ret ? ret : count;
|
||||
}
|
||||
@@ -1267,7 +1192,6 @@ static ssize_t show_energy_performance_p
|
||||
|
||||
static void amd_pstate_driver_cleanup(void)
|
||||
{
|
||||
- amd_pstate_cppc_enable(false);
|
||||
cppc_state = AMD_PSTATE_DISABLE;
|
||||
current_pstate_driver = NULL;
|
||||
}
|
||||
@@ -1301,14 +1225,6 @@ static int amd_pstate_register_driver(in
|
||||
|
||||
cppc_state = mode;
|
||||
|
||||
- ret = amd_pstate_cppc_enable(true);
|
||||
- if (ret) {
|
||||
- pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n",
|
||||
- ret);
|
||||
- amd_pstate_driver_cleanup();
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
/* at least one CPU supports CPB */
|
||||
current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB);
|
||||
|
||||
@@ -1548,11 +1464,15 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf,
|
||||
cpudata->nominal_freq,
|
||||
perf.highest_perf);
|
||||
+ policy->driver_data = cpudata;
|
||||
+
|
||||
+ ret = amd_pstate_cppc_enable(policy);
|
||||
+ if (ret)
|
||||
+ goto free_cpudata1;
|
||||
|
||||
/* It will be updated by governor */
|
||||
policy->cur = policy->cpuinfo.min_freq;
|
||||
|
||||
- policy->driver_data = cpudata;
|
||||
|
||||
policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
|
||||
|
||||
@@ -1644,31 +1564,11 @@ static int amd_pstate_epp_set_policy(str
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int amd_pstate_epp_reenable(struct cpufreq_policy *policy)
|
||||
-{
|
||||
- int ret;
|
||||
-
|
||||
- ret = amd_pstate_cppc_enable(true);
|
||||
- if (ret)
|
||||
- pr_err("failed to enable amd pstate during resume, return %d\n", ret);
|
||||
-
|
||||
-
|
||||
- return amd_pstate_epp_update_limit(policy);
|
||||
-}
|
||||
-
|
||||
static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy)
|
||||
{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- int ret;
|
||||
-
|
||||
- pr_debug("AMD CPU Core %d going online\n", cpudata->cpu);
|
||||
+ pr_debug("AMD CPU Core %d going online\n", policy->cpu);
|
||||
|
||||
- ret = amd_pstate_epp_reenable(policy);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- cpudata->suspended = false;
|
||||
-
|
||||
- return 0;
|
||||
+ return amd_pstate_cppc_enable(policy);
|
||||
}
|
||||
|
||||
static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
|
||||
@@ -1686,11 +1586,6 @@ static int amd_pstate_epp_cpu_offline(st
|
||||
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- int ret;
|
||||
-
|
||||
- /* avoid suspending when EPP is not enabled */
|
||||
- if (cppc_state != AMD_PSTATE_ACTIVE)
|
||||
- return 0;
|
||||
|
||||
/* invalidate to ensure it's rewritten during resume */
|
||||
cpudata->cppc_req_cached = 0;
|
||||
@@ -1698,11 +1593,6 @@ static int amd_pstate_epp_suspend(struct
|
||||
/* set this flag to avoid setting core offline*/
|
||||
cpudata->suspended = true;
|
||||
|
||||
- /* disable CPPC in lowlevel firmware */
|
||||
- ret = amd_pstate_cppc_enable(false);
|
||||
- if (ret)
|
||||
- pr_err("failed to suspend, return %d\n", ret);
|
||||
-
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1711,8 +1601,12 @@ static int amd_pstate_epp_resume(struct
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
if (cpudata->suspended) {
|
||||
+ int ret;
|
||||
+
|
||||
/* enable amd pstate from suspend state*/
|
||||
- amd_pstate_epp_reenable(policy);
|
||||
+ ret = amd_pstate_epp_update_limit(policy);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
|
||||
cpudata->suspended = false;
|
||||
}
|
||||
@@ -1727,8 +1621,6 @@ static struct cpufreq_driver amd_pstate_
|
||||
.fast_switch = amd_pstate_fast_switch,
|
||||
.init = amd_pstate_cpu_init,
|
||||
.exit = amd_pstate_cpu_exit,
|
||||
- .suspend = amd_pstate_cpu_suspend,
|
||||
- .resume = amd_pstate_cpu_resume,
|
||||
.set_boost = amd_pstate_set_boost,
|
||||
.update_limits = amd_pstate_update_limits,
|
||||
.name = "amd-pstate",
|
||||
@@ -1895,7 +1787,6 @@ static int __init amd_pstate_init(void)
|
||||
|
||||
global_attr_free:
|
||||
cpufreq_unregister_driver(current_pstate_driver);
|
||||
- amd_pstate_cppc_enable(false);
|
||||
return ret;
|
||||
}
|
||||
device_initcall(amd_pstate_init);
|
@@ -1,105 +0,0 @@
|
||||
From c06cca99a6d74e7a6d6f020dbf982b0b9bf704e6 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:33 -0600
|
||||
Subject: cpufreq/amd-pstate: Stop caching EPP
|
||||
|
||||
EPP values are cached in the cpudata structure per CPU. This is needless,
|
||||
though, because they are also cached in the CPPC request variable.
|
||||
|
||||
Drop the separate cache for EPP values and always reference the CPPC
|
||||
request variable when needed.
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 19 ++++++++++---------
|
||||
drivers/cpufreq/amd-pstate.h | 1 -
|
||||
2 files changed, 10 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -268,8 +268,6 @@ static int msr_update_perf(struct cpufre
|
||||
}
|
||||
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
- if (epp != cpudata->epp_cached)
|
||||
- WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -318,7 +316,6 @@ static int msr_set_epp(struct cpufreq_po
|
||||
}
|
||||
|
||||
/* update both so that msr_update_perf() can effectively check */
|
||||
- WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
|
||||
return ret;
|
||||
@@ -335,9 +332,12 @@ static int shmem_set_epp(struct cpufreq_
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
+ u8 epp_cached;
|
||||
u64 value;
|
||||
int ret;
|
||||
|
||||
+
|
||||
+ epp_cached = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached);
|
||||
if (trace_amd_pstate_epp_perf_enabled()) {
|
||||
union perf_cached perf = cpudata->perf;
|
||||
|
||||
@@ -348,10 +348,10 @@ static int shmem_set_epp(struct cpufreq_
|
||||
FIELD_GET(AMD_CPPC_MAX_PERF_MASK,
|
||||
cpudata->cppc_req_cached),
|
||||
policy->boost_enabled,
|
||||
- epp != cpudata->epp_cached);
|
||||
+ epp != epp_cached);
|
||||
}
|
||||
|
||||
- if (epp == cpudata->epp_cached)
|
||||
+ if (epp == epp_cached)
|
||||
return 0;
|
||||
|
||||
perf_ctrls.energy_perf = epp;
|
||||
@@ -360,7 +360,6 @@ static int shmem_set_epp(struct cpufreq_
|
||||
pr_debug("failed to set energy perf value (%d)\n", ret);
|
||||
return ret;
|
||||
}
|
||||
- WRITE_ONCE(cpudata->epp_cached, epp);
|
||||
|
||||
value = READ_ONCE(cpudata->cppc_req_cached);
|
||||
value &= ~AMD_CPPC_EPP_PERF_MASK;
|
||||
@@ -1168,9 +1167,11 @@ static ssize_t show_energy_performance_p
|
||||
struct cpufreq_policy *policy, char *buf)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- u8 preference;
|
||||
+ u8 preference, epp;
|
||||
+
|
||||
+ epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached);
|
||||
|
||||
- switch (cpudata->epp_cached) {
|
||||
+ switch (epp) {
|
||||
case AMD_CPPC_EPP_PERFORMANCE:
|
||||
preference = EPP_INDEX_PERFORMANCE;
|
||||
break;
|
||||
@@ -1533,7 +1534,7 @@ static int amd_pstate_epp_update_limit(s
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
epp = 0;
|
||||
else
|
||||
- epp = READ_ONCE(cpudata->epp_cached);
|
||||
+ epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached);
|
||||
|
||||
perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -102,7 +102,6 @@ struct amd_cpudata {
|
||||
bool hw_prefcore;
|
||||
|
||||
/* EPP feature related attributes*/
|
||||
- u8 epp_cached;
|
||||
u32 policy;
|
||||
bool suspended;
|
||||
u8 epp_default;
|
@@ -1,39 +0,0 @@
|
||||
From a82e4f4eb6e5e9806c66285cb3cefde644b8ea6b Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Wed, 26 Feb 2025 01:49:34 -0600
|
||||
Subject: cpufreq/amd-pstate: Drop actions in amd_pstate_epp_cpu_offline()
|
||||
|
||||
When the CPU goes offline there is no need to change the CPPC request
|
||||
because the CPU will go into the deepest C-state it supports already.
|
||||
|
||||
Actually changing the CPPC request when it goes offline messes up the
|
||||
cached values and can lead to the wrong values being restored when
|
||||
it comes back.
|
||||
|
||||
Instead drop the actions and if the CPU comes back online let
|
||||
amd_pstate_epp_set_policy() restore it to expected values.
|
||||
|
||||
Reviewed-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 9 +--------
|
||||
1 file changed, 1 insertion(+), 8 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1574,14 +1574,7 @@ static int amd_pstate_epp_cpu_online(str
|
||||
|
||||
static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
|
||||
{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
-
|
||||
- if (cpudata->suspended)
|
||||
- return 0;
|
||||
-
|
||||
- return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf,
|
||||
- AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
|
@@ -1,41 +0,0 @@
|
||||
From de3dd387423b30565e846e0ff4424e2c99164030 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <superm1@kernel.org>
|
||||
Date: Thu, 27 Feb 2025 14:09:08 -0600
|
||||
Subject: cpufreq/amd-pstate: fix warning noticed by kernel test robot
|
||||
|
||||
Reported-by: kernel test robot <lkp@intel.com>
|
||||
Closes: https://lore.kernel.org/oe-kbuild-all/202502272001.nafS0qXq-lkp@intel.com/
|
||||
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 13 ++++++-------
|
||||
1 file changed, 6 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -903,20 +903,19 @@ static int amd_pstate_init_freq(struct a
|
||||
return ret;
|
||||
perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
+ if (quirks && quirks->nominal_freq)
|
||||
+ nominal_freq = quirks->nominal_freq;
|
||||
+ else
|
||||
+ nominal_freq = cppc_perf.nominal_freq;
|
||||
+ nominal_freq *= 1000;
|
||||
+
|
||||
if (quirks && quirks->lowest_freq) {
|
||||
min_freq = quirks->lowest_freq;
|
||||
perf.lowest_perf = freq_to_perf(perf, nominal_freq, min_freq);
|
||||
WRITE_ONCE(cpudata->perf, perf);
|
||||
} else
|
||||
min_freq = cppc_perf.lowest_freq;
|
||||
-
|
||||
- if (quirks && quirks->nominal_freq)
|
||||
- nominal_freq = quirks->nominal_freq;
|
||||
- else
|
||||
- nominal_freq = cppc_perf.nominal_freq;
|
||||
-
|
||||
min_freq *= 1000;
|
||||
- nominal_freq *= 1000;
|
||||
|
||||
WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
|
||||
|
@@ -1,42 +0,0 @@
|
||||
From 7e68278a4a90d52966b923404a2d280e3a83b66f Mon Sep 17 00:00:00 2001
|
||||
From: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Date: Mon, 7 Apr 2025 08:19:26 +0000
|
||||
Subject: cpufreq/amd-pstate: Fix min_limit perf and freq updation for
|
||||
performance governor
|
||||
|
||||
The min_limit perf and freq values can get disconnected with performance
|
||||
governor, as we only modify the perf value in the special case. Fix that
|
||||
by modifying the perf and freq values together.
|
||||
|
||||
Fixes: 009d1c29a451 ("cpufreq/amd-pstate: Move perf values into a union")
|
||||
Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@amd.com>
|
||||
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Link: https://lore.kernel.org/r/20250407081925.850473-1-dhananjay.ugwekar@amd.com
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 11 +++++++----
|
||||
1 file changed, 7 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -607,13 +607,16 @@ static void amd_pstate_update_min_max_li
|
||||
union perf_cached perf = READ_ONCE(cpudata->perf);
|
||||
|
||||
perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max);
|
||||
- perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min);
|
||||
+ WRITE_ONCE(cpudata->max_limit_freq, policy->max);
|
||||
|
||||
- if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
|
||||
perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf);
|
||||
+ WRITE_ONCE(cpudata->min_limit_freq, min(cpudata->nominal_freq, cpudata->max_limit_freq));
|
||||
+ } else {
|
||||
+ perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min);
|
||||
+ WRITE_ONCE(cpudata->min_limit_freq, policy->min);
|
||||
+ }
|
||||
|
||||
- WRITE_ONCE(cpudata->max_limit_freq, policy->max);
|
||||
- WRITE_ONCE(cpudata->min_limit_freq, policy->min);
|
||||
WRITE_ONCE(cpudata->perf, perf);
|
||||
}
|
||||
|
@@ -1,4 +1,4 @@
|
||||
From 247749c27f92a789d4f1727aa870167c25ca3c5e Mon Sep 17 00:00:00 2001
|
||||
From 1cb9f09cead0ba384729bfdc74d6fa21d586530c Mon Sep 17 00:00:00 2001
|
||||
From: Christian Loehle <christian.loehle@arm.com>
|
||||
Date: Thu, 5 Sep 2024 10:26:39 +0100
|
||||
Subject: cpuidle: Prefer teo over menu governor
|
||||
@@ -36,7 +36,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com>
|
||||
depends on KVM_GUEST
|
||||
--- a/drivers/cpuidle/governors/menu.c
|
||||
+++ b/drivers/cpuidle/governors/menu.c
|
||||
@@ -519,7 +519,7 @@ static int menu_enable_device(struct cpu
|
||||
@@ -513,7 +513,7 @@ static int menu_enable_device(struct cpu
|
||||
|
||||
static struct cpuidle_governor menu_governor = {
|
||||
.name = "menu",
|
||||
|
@@ -1,65 +0,0 @@
|
||||
From 5e5a835c50afc3b9bb2b8b9175d0924abb5a7f3c Mon Sep 17 00:00:00 2001
|
||||
From: Eric Biggers <ebiggers@google.com>
|
||||
Date: Mon, 27 Jan 2025 13:16:09 -0800
|
||||
Subject: crypto: x86/aes-xts - make the fast path 64-bit specific
|
||||
|
||||
Remove 32-bit support from the fast path in xts_crypt(). Then optimize
|
||||
it for 64-bit, and simplify the code, by switching to sg_virt() and
|
||||
removing the now-unnecessary checks for crossing a page boundary.
|
||||
|
||||
The result is simpler code that is slightly smaller and faster in the
|
||||
case that actually matters (64-bit).
|
||||
|
||||
Signed-off-by: Eric Biggers <ebiggers@google.com>
|
||||
---
|
||||
arch/x86/crypto/aesni-intel_glue.c | 30 ++++++++++--------------------
|
||||
1 file changed, 10 insertions(+), 20 deletions(-)
|
||||
|
||||
--- a/arch/x86/crypto/aesni-intel_glue.c
|
||||
+++ b/arch/x86/crypto/aesni-intel_glue.c
|
||||
@@ -581,11 +581,8 @@ xts_crypt(struct skcipher_request *req,
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
|
||||
- const unsigned int cryptlen = req->cryptlen;
|
||||
- struct scatterlist *src = req->src;
|
||||
- struct scatterlist *dst = req->dst;
|
||||
|
||||
- if (unlikely(cryptlen < AES_BLOCK_SIZE))
|
||||
+ if (unlikely(req->cryptlen < AES_BLOCK_SIZE))
|
||||
return -EINVAL;
|
||||
|
||||
kernel_fpu_begin();
|
||||
@@ -593,23 +590,16 @@ xts_crypt(struct skcipher_request *req,
|
||||
|
||||
/*
|
||||
* In practice, virtually all XTS plaintexts and ciphertexts are either
|
||||
- * 512 or 4096 bytes, aligned such that they don't span page boundaries.
|
||||
- * To optimize the performance of these cases, and also any other case
|
||||
- * where no page boundary is spanned, the below fast-path handles
|
||||
- * single-page sources and destinations as efficiently as possible.
|
||||
+ * 512 or 4096 bytes and do not use multiple scatterlist elements. To
|
||||
+ * optimize the performance of these cases, the below fast-path handles
|
||||
+ * single-scatterlist-element messages as efficiently as possible. The
|
||||
+ * code is 64-bit specific, as it assumes no page mapping is needed.
|
||||
*/
|
||||
- if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
|
||||
- src->offset + cryptlen <= PAGE_SIZE &&
|
||||
- dst->offset + cryptlen <= PAGE_SIZE)) {
|
||||
- struct page *src_page = sg_page(src);
|
||||
- struct page *dst_page = sg_page(dst);
|
||||
- void *src_virt = kmap_local_page(src_page) + src->offset;
|
||||
- void *dst_virt = kmap_local_page(dst_page) + dst->offset;
|
||||
-
|
||||
- (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
|
||||
- req->iv);
|
||||
- kunmap_local(dst_virt);
|
||||
- kunmap_local(src_virt);
|
||||
+ if (IS_ENABLED(CONFIG_X86_64) &&
|
||||
+ likely(req->src->length >= req->cryptlen &&
|
||||
+ req->dst->length >= req->cryptlen)) {
|
||||
+ (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
|
||||
+ sg_virt(req->dst), req->cryptlen, req->iv);
|
||||
kernel_fpu_end();
|
||||
return 0;
|
||||
}
|
File diff suppressed because it is too large
@@ -1,176 +0,0 @@
|
||||
From 4506de20739ac4726a258faa98609a552184d2d2 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Sergio=20Gonz=C3=A1lez=20Collado?=
|
||||
<sergio.collado@gmail.com>
|
||||
Date: Sun, 2 Mar 2025 23:15:18 +0100
|
||||
Subject: Kunit to check the longest symbol length
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
The longest length of a symbol (KSYM_NAME_LEN) was increased to 512
|
||||
in the reference [1]. This patch adds a kunit test suite to check the longest
|
||||
symbol length. These tests verify that the longest symbol length defined
|
||||
is supported.
|
||||
|
||||
This test can also help other efforts for longer symbol length,
|
||||
like [2].
|
||||
|
||||
The test suite defines one symbol with the longest possible length.
|
||||
|
||||
The first test verifies that functions with names of the created
|
||||
symbol can be called or not.
|
||||
|
||||
The second test verifies that the symbols are created (or
|
||||
not) in the kernel symbol table.
|
||||
|
||||
[1] https://lore.kernel.org/lkml/20220802015052.10452-6-ojeda@kernel.org/
|
||||
[2] https://lore.kernel.org/lkml/20240605032120.3179157-1-song@kernel.org/
|
||||
|
||||
Tested-by: Martin Rodriguez Reboredo <yakoyoku@gmail.com>
|
||||
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
|
||||
Reviewed-by: Rae Moar <rmoar@google.com>
|
||||
Signed-off-by: Sergio González Collado <sergio.collado@gmail.com>
|
||||
Link: https://github.com/Rust-for-Linux/linux/issues/504
|
||||
Source: https://lore.kernel.org/rust-for-linux/20250302221518.76874-1-sergio.collado@gmail.com/
|
||||
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/63
|
||||
---
|
||||
arch/x86/tools/insn_decoder_test.c | 3 +-
|
||||
lib/Kconfig.debug | 9 ++++
|
||||
lib/Makefile | 2 +
|
||||
lib/longest_symbol_kunit.c | 82 ++++++++++++++++++++++++++++++
|
||||
4 files changed, 95 insertions(+), 1 deletion(-)
|
||||
create mode 100644 lib/longest_symbol_kunit.c
|
||||
|
||||
--- a/arch/x86/tools/insn_decoder_test.c
|
||||
+++ b/arch/x86/tools/insn_decoder_test.c
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <assert.h>
|
||||
#include <unistd.h>
|
||||
#include <stdarg.h>
|
||||
+#include <linux/kallsyms.h>
|
||||
|
||||
#define unlikely(cond) (cond)
|
||||
|
||||
@@ -106,7 +107,7 @@ static void parse_args(int argc, char **
|
||||
}
|
||||
}
|
||||
|
||||
-#define BUFSIZE 256
|
||||
+#define BUFSIZE (256 + KSYM_NAME_LEN)
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
--- a/lib/Kconfig.debug
|
||||
+++ b/lib/Kconfig.debug
|
||||
@@ -2838,6 +2838,15 @@ config FORTIFY_KUNIT_TEST
|
||||
by the str*() and mem*() family of functions. For testing runtime
|
||||
traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests.
|
||||
|
||||
+config LONGEST_SYM_KUNIT_TEST
|
||||
+ tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
|
||||
+ depends on KUNIT && KPROBES
|
||||
+ default KUNIT_ALL_TESTS
|
||||
+ help
|
||||
+ Tests the longest symbol possible
|
||||
+
|
||||
+ If unsure, say N.
|
||||
+
|
||||
config HW_BREAKPOINT_KUNIT_TEST
|
||||
bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS
|
||||
depends on HAVE_HW_BREAKPOINT
|
||||
--- a/lib/Makefile
|
||||
+++ b/lib/Makefile
|
||||
@@ -398,6 +398,8 @@ obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fort
|
||||
obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
|
||||
obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o
|
||||
obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o
|
||||
+obj-$(CONFIG_LONGEST_SYM_KUNIT_TEST) += longest_symbol_kunit.o
|
||||
+CFLAGS_longest_symbol_kunit.o += $(call cc-disable-warning, missing-prototypes)
|
||||
|
||||
obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
|
||||
|
||||
--- /dev/null
|
||||
+++ b/lib/longest_symbol_kunit.c
|
||||
@@ -0,0 +1,82 @@
|
||||
+// SPDX-License-Identifier: GPL-2.0
|
||||
+/*
|
||||
+ * Test the longest symbol length. Execute with:
|
||||
+ * ./tools/testing/kunit/kunit.py run longest-symbol
|
||||
+ * --arch=x86_64 --kconfig_add CONFIG_KPROBES=y --kconfig_add CONFIG_MODULES=y
|
||||
+ * --kconfig_add CONFIG_RETPOLINE=n --kconfig_add CONFIG_CFI_CLANG=n
|
||||
+ * --kconfig_add CONFIG_MITIGATION_RETPOLINE=n
|
||||
+ */
|
||||
+
|
||||
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
+
|
||||
+#include <kunit/test.h>
|
||||
+#include <linux/stringify.h>
|
||||
+#include <linux/kprobes.h>
|
||||
+#include <linux/kallsyms.h>
|
||||
+
|
||||
+#define DI(name) s##name##name
|
||||
+#define DDI(name) DI(n##name##name)
|
||||
+#define DDDI(name) DDI(n##name##name)
|
||||
+#define DDDDI(name) DDDI(n##name##name)
|
||||
+#define DDDDDI(name) DDDDI(n##name##name)
|
||||
+
|
||||
+/*Generate a symbol whose name length is 511 */
|
||||
+#define LONGEST_SYM_NAME DDDDDI(g1h2i3j4k5l6m7n)
|
||||
+
|
||||
+#define RETURN_LONGEST_SYM 0xAAAAA
|
||||
+
|
||||
+noinline int LONGEST_SYM_NAME(void);
|
||||
+noinline int LONGEST_SYM_NAME(void)
|
||||
+{
|
||||
+ return RETURN_LONGEST_SYM;
|
||||
+}
|
||||
+
|
||||
+_Static_assert(sizeof(__stringify(LONGEST_SYM_NAME)) == KSYM_NAME_LEN,
|
||||
+"Incorrect symbol length found. Expected KSYM_NAME_LEN: "
|
||||
+__stringify(KSYM_NAME_LEN) ", but found: "
|
||||
+__stringify(sizeof(LONGEST_SYM_NAME)));
|
||||
+
|
||||
+static void test_longest_symbol(struct kunit *test)
|
||||
+{
|
||||
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, LONGEST_SYM_NAME());
|
||||
+};
|
||||
+
|
||||
+static void test_longest_symbol_kallsyms(struct kunit *test)
|
||||
+{
|
||||
+ unsigned long (*kallsyms_lookup_name)(const char *name);
|
||||
+ static int (*longest_sym)(void);
|
||||
+
|
||||
+ struct kprobe kp = {
|
||||
+ .symbol_name = "kallsyms_lookup_name",
|
||||
+ };
|
||||
+
|
||||
+ if (register_kprobe(&kp) < 0) {
|
||||
+ pr_info("%s: kprobe not registered", __func__);
|
||||
+ KUNIT_FAIL(test, "test_longest_symbol kallsyms: kprobe not registered\n");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ kunit_warn(test, "test_longest_symbol kallsyms: kprobe registered\n");
|
||||
+ kallsyms_lookup_name = (unsigned long (*)(const char *name))kp.addr;
|
||||
+ unregister_kprobe(&kp);
|
||||
+
|
||||
+ longest_sym =
|
||||
+ (void *) kallsyms_lookup_name(__stringify(LONGEST_SYM_NAME));
|
||||
+ KUNIT_EXPECT_EQ(test, RETURN_LONGEST_SYM, longest_sym());
|
||||
+};
|
||||
+
|
||||
+static struct kunit_case longest_symbol_test_cases[] = {
|
||||
+ KUNIT_CASE(test_longest_symbol),
|
||||
+ KUNIT_CASE(test_longest_symbol_kallsyms),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static struct kunit_suite longest_symbol_test_suite = {
|
||||
+ .name = "longest-symbol",
|
||||
+ .test_cases = longest_symbol_test_cases,
|
||||
+};
|
||||
+kunit_test_suite(longest_symbol_test_suite);
|
||||
+
|
||||
+MODULE_LICENSE("GPL");
|
||||
+MODULE_DESCRIPTION("Test the longest symbol length");
|
||||
+MODULE_AUTHOR("Sergio González Collado");
|
70
debian/patches/patchset-pf/fixes/0001-mm-fix-ratelimit_pages-update-error-in-dirty_ratio_h.patch
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
From cda8b1022f32bb7a917148f75f4641e7a5b3e729 Mon Sep 17 00:00:00 2001
|
||||
From: Jinliang Zheng <alexjlzheng@tencent.com>
|
||||
Date: Tue, 15 Apr 2025 17:02:32 +0800
|
||||
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
|
||||
|
||||
In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
|
||||
calling writeback_set_ratelimit(), as global_dirty_limits() always
|
||||
prioritizes the value of vm_dirty_bytes.
|
||||
|
||||
It's domain_dirty_limits() that's relevant here, not node_dirty_ok:
|
||||
|
||||
dirty_ratio_handler
|
||||
writeback_set_ratelimit
|
||||
global_dirty_limits(&dirty_thresh) <- ratelimit_pages based on dirty_thresh
|
||||
domain_dirty_limits
|
||||
if (bytes) <- bytes = vm_dirty_bytes <--------+
|
||||
thresh = f1(bytes) <- prioritizes vm_dirty_bytes |
|
||||
else |
|
||||
thresh = f2(ratio) |
|
||||
ratelimit_pages = f3(dirty_thresh) |
|
||||
vm_dirty_bytes = 0 <- it's late! ---------------------+
|
||||
|
||||
This causes ratelimit_pages to still use the value calculated based on
|
||||
vm_dirty_bytes, which is wrong now.
|
||||
|
||||
|
||||
The impact visible to userspace is difficult to capture directly because
|
||||
there is no procfs/sysfs interface exported to user space. However, it
|
||||
will have a real impact on the balance of dirty pages.
|
||||
|
||||
For example:
|
||||
|
||||
1. By default, we have vm_dirty_ratio=40, vm_dirty_bytes=0
|
||||
|
||||
2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192,
|
||||
vm_dirty_ratio=0, and ratelimit_pages is calculated based on
|
||||
vm_dirty_bytes now.
|
||||
|
||||
3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
|
||||
zero when writeback_set_ratelimit() -> global_dirty_limits() ->
|
||||
domain_dirty_limits() is called, ratelimit_pages is still calculated
|
||||
based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
|
||||
conform to the actual intent of the user.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
|
||||
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
|
||||
Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com>
|
||||
Reviewed-by: MengEn Sun <mengensun@tencent.com>
|
||||
Cc: Andrea Righi <andrea@betterlinux.com>
|
||||
Cc: Fengguang Wu <fengguang.wu@intel.com>
|
||||
Cc: Jinliang Zheng <alexjlzheng@tencent.com>
|
||||
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/page-writeback.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/mm/page-writeback.c
|
||||
+++ b/mm/page-writeback.c
|
||||
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str
|
||||
|
||||
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
|
||||
- writeback_set_ratelimit();
|
||||
vm_dirty_bytes = 0;
|
||||
+ writeback_set_ratelimit();
|
||||
}
|
||||
return ret;
|
||||
}
|
179
debian/patches/patchset-pf/fixes/0002-vgacon-Add-check-for-vc_origin-address-range-in-vgac.patch
vendored
Normal file
@@ -0,0 +1,179 @@
|
||||
From 30a724581b5037176f6492359c189ebb180ccf1f Mon Sep 17 00:00:00 2001
|
||||
From: GONG Ruiqi <gongruiqi1@huawei.com>
|
||||
Date: Sun, 27 Apr 2025 10:53:03 +0800
|
||||
Subject: vgacon: Add check for vc_origin address range in vgacon_scroll()
|
||||
|
||||
Our in-house Syzkaller reported the following BUG (twice), which we
|
||||
believed was the same issue with [1]:
|
||||
|
||||
==================================================================
|
||||
BUG: KASAN: slab-out-of-bounds in vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
|
||||
Read of size 2 at addr ffff88800f5bef60 by task syz.7.2620/12393
|
||||
...
|
||||
Call Trace:
|
||||
<TASK>
|
||||
__dump_stack lib/dump_stack.c:88 [inline]
|
||||
dump_stack_lvl+0x72/0xa0 lib/dump_stack.c:106
|
||||
print_address_description.constprop.0+0x6b/0x3d0 mm/kasan/report.c:364
|
||||
print_report+0xba/0x280 mm/kasan/report.c:475
|
||||
kasan_report+0xa9/0xe0 mm/kasan/report.c:588
|
||||
vcs_scr_readw+0xc2/0xd0 drivers/tty/vt/vt.c:4740
|
||||
vcs_write_buf_noattr drivers/tty/vt/vc_screen.c:493 [inline]
|
||||
vcs_write+0x586/0x840 drivers/tty/vt/vc_screen.c:690
|
||||
vfs_write+0x219/0x960 fs/read_write.c:584
|
||||
ksys_write+0x12e/0x260 fs/read_write.c:639
|
||||
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
|
||||
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
|
||||
entry_SYSCALL_64_after_hwframe+0x78/0xe2
|
||||
...
|
||||
</TASK>
|
||||
|
||||
Allocated by task 5614:
|
||||
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
|
||||
kasan_set_track+0x25/0x30 mm/kasan/common.c:52
|
||||
____kasan_kmalloc mm/kasan/common.c:374 [inline]
|
||||
__kasan_kmalloc+0x8f/0xa0 mm/kasan/common.c:383
|
||||
kasan_kmalloc include/linux/kasan.h:201 [inline]
|
||||
__do_kmalloc_node mm/slab_common.c:1007 [inline]
|
||||
__kmalloc+0x62/0x140 mm/slab_common.c:1020
|
||||
kmalloc include/linux/slab.h:604 [inline]
|
||||
kzalloc include/linux/slab.h:721 [inline]
|
||||
vc_do_resize+0x235/0xf40 drivers/tty/vt/vt.c:1193
|
||||
vgacon_adjust_height+0x2d4/0x350 drivers/video/console/vgacon.c:1007
|
||||
vgacon_font_set+0x1f7/0x240 drivers/video/console/vgacon.c:1031
|
||||
con_font_set drivers/tty/vt/vt.c:4628 [inline]
|
||||
con_font_op+0x4da/0xa20 drivers/tty/vt/vt.c:4675
|
||||
vt_k_ioctl+0xa10/0xb30 drivers/tty/vt/vt_ioctl.c:474
|
||||
vt_ioctl+0x14c/0x1870 drivers/tty/vt/vt_ioctl.c:752
|
||||
tty_ioctl+0x655/0x1510 drivers/tty/tty_io.c:2779
|
||||
vfs_ioctl fs/ioctl.c:51 [inline]
|
||||
__do_sys_ioctl fs/ioctl.c:871 [inline]
|
||||
__se_sys_ioctl+0x12d/0x190 fs/ioctl.c:857
|
||||
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
|
||||
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
|
||||
entry_SYSCALL_64_after_hwframe+0x78/0xe2
|
||||
|
||||
Last potentially related work creation:
|
||||
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
|
||||
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
|
||||
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
|
||||
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
|
||||
__sock_release+0xb5/0x270 net/socket.c:663
|
||||
sock_close+0x1e/0x30 net/socket.c:1425
|
||||
__fput+0x408/0xab0 fs/file_table.c:384
|
||||
__fput_sync+0x4c/0x60 fs/file_table.c:465
|
||||
__do_sys_close fs/open.c:1580 [inline]
|
||||
__se_sys_close+0x68/0xd0 fs/open.c:1565
|
||||
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
|
||||
do_syscall_64+0x59/0x110 arch/x86/entry/common.c:81
|
||||
entry_SYSCALL_64_after_hwframe+0x78/0xe2
|
||||
|
||||
Second to last potentially related work creation:
|
||||
kasan_save_stack+0x20/0x40 mm/kasan/common.c:45
|
||||
__kasan_record_aux_stack+0x94/0xa0 mm/kasan/generic.c:492
|
||||
__call_rcu_common.constprop.0+0xc3/0xa10 kernel/rcu/tree.c:2713
|
||||
netlink_release+0x620/0xc20 net/netlink/af_netlink.c:802
|
||||
__sock_release+0xb5/0x270 net/socket.c:663
|
||||
sock_close+0x1e/0x30 net/socket.c:1425
|
||||
__fput+0x408/0xab0 fs/file_table.c:384
|
||||
task_work_run+0x154/0x240 kernel/task_work.c:239
|
||||
exit_task_work include/linux/task_work.h:45 [inline]
|
||||
do_exit+0x8e5/0x1320 kernel/exit.c:874
|
||||
do_group_exit+0xcd/0x280 kernel/exit.c:1023
|
||||
get_signal+0x1675/0x1850 kernel/signal.c:2905
|
||||
arch_do_signal_or_restart+0x80/0x3b0 arch/x86/kernel/signal.c:310
|
||||
exit_to_user_mode_loop kernel/entry/common.c:111 [inline]
|
||||
exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline]
|
||||
__syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline]
|
||||
syscall_exit_to_user_mode+0x1b3/0x1e0 kernel/entry/common.c:218
|
||||
do_syscall_64+0x66/0x110 arch/x86/entry/common.c:87
|
||||
entry_SYSCALL_64_after_hwframe+0x78/0xe2
|
||||
|
||||
The buggy address belongs to the object at ffff88800f5be000
|
||||
which belongs to the cache kmalloc-2k of size 2048
|
||||
The buggy address is located 2656 bytes to the right of
|
||||
allocated 1280-byte region [ffff88800f5be000, ffff88800f5be500)
|
||||
|
||||
...
|
||||
|
||||
Memory state around the buggy address:
|
||||
ffff88800f5bee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
|
||||
ffff88800f5bee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
|
||||
>ffff88800f5bef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
|
||||
^
|
||||
ffff88800f5bef80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
|
||||
ffff88800f5bf000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
|
||||
==================================================================
|
||||
|
||||
By analyzing the vmcore, we found that vc->vc_origin was somehow placed
|
||||
one line prior to vc->vc_screenbuf when vc was in KD_TEXT mode, and
|
||||
further writes to /dev/vcs caused out-of-bounds reads (and writes
|
||||
right after) in vcs_write_buf_noattr().
|
||||
|
||||
Our further experiments show that in most cases, vc->vc_origin equals to
|
||||
vga_vram_base when the console is in KD_TEXT mode, and it's around
|
||||
vc->vc_screenbuf for the KD_GRAPHICS mode. But via triggering a
|
||||
TIOCL_SETVESABLANK ioctl beforehand, we can make vc->vc_origin be around
|
||||
vc->vc_screenbuf while the console is in KD_TEXT mode, and then by
|
||||
writing the special 'ESC M' control sequence to the tty certain times
|
||||
(depends on the value of `vc->state.y - vc->vc_top`), we can eventually
|
||||
move vc->vc_origin prior to vc->vc_screenbuf. Here's the PoC, tested on
|
||||
QEMU:
|
||||
|
||||
```
|
||||
int main() {
|
||||
const int RI_NUM = 10; // should be greater than `vc->state.y - vc->vc_top`
|
||||
int tty_fd, vcs_fd;
|
||||
const char *tty_path = "/dev/tty0";
|
||||
const char *vcs_path = "/dev/vcs";
|
||||
const char escape_seq[] = "\x1bM"; // ESC + M
|
||||
const char trigger_seq[] = "Let's trigger an OOB write.";
|
||||
struct vt_sizes vt_size = { 70, 2 };
|
||||
int blank = TIOCL_BLANKSCREEN;
|
||||
|
||||
tty_fd = open(tty_path, O_RDWR);
|
||||
|
||||
char vesa_mode[] = { TIOCL_SETVESABLANK, 1 };
|
||||
ioctl(tty_fd, TIOCLINUX, vesa_mode);
|
||||
|
||||
ioctl(tty_fd, TIOCLINUX, &blank);
|
||||
ioctl(tty_fd, VT_RESIZE, &vt_size);
|
||||
|
||||
for (int i = 0; i < RI_NUM; ++i)
|
||||
write(tty_fd, escape_seq, sizeof(escape_seq) - 1);
|
||||
|
||||
vcs_fd = open(vcs_path, O_RDWR);
|
||||
write(vcs_fd, trigger_seq, sizeof(trigger_seq));
|
||||
|
||||
close(vcs_fd);
|
||||
close(tty_fd);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
To solve this problem, add an address range validation check in
|
||||
vgacon_scroll(), ensuring vc->vc_origin never precedes vc_screenbuf.
|
||||
|
||||
Reported-by: syzbot+9c09fda97a1a65ea859b@syzkaller.appspotmail.com
|
||||
Closes: https://syzkaller.appspot.com/bug?extid=9c09fda97a1a65ea859b [1]
|
||||
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
|
||||
Cc: stable@vger.kernel.org
|
||||
Co-developed-by: Yi Yang <yiyang13@huawei.com>
|
||||
Signed-off-by: Yi Yang <yiyang13@huawei.com>
|
||||
Signed-off-by: GONG Ruiqi <gongruiqi1@huawei.com>
|
||||
Signed-off-by: Helge Deller <deller@gmx.de>
|
||||
---
|
||||
drivers/video/console/vgacon.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/video/console/vgacon.c
|
||||
+++ b/drivers/video/console/vgacon.c
|
||||
@@ -1168,7 +1168,7 @@ static bool vgacon_scroll(struct vc_data
|
||||
c->vc_screenbuf_size - delta);
|
||||
c->vc_origin = vga_vram_end - c->vc_screenbuf_size;
|
||||
vga_rolled_over = 0;
|
||||
- } else
|
||||
+ } else if (oldo - delta >= (unsigned long)c->vc_screenbuf)
|
||||
c->vc_origin -= delta;
|
||||
c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size;
|
||||
scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char,
|
@@ -1,36 +0,0 @@
|
||||
From b5a4b82efd19d0687a5582a58f6830bf714e34fc Mon Sep 17 00:00:00 2001
|
||||
From: Nathan Chancellor <nathan@kernel.org>
|
||||
Date: Tue, 18 Mar 2025 15:32:30 -0700
|
||||
Subject: x86/tools: Drop duplicate unlikely() definition in
|
||||
insn_decoder_test.c
|
||||
|
||||
After commit c104c16073b7 ("Kunit to check the longest symbol length"),
|
||||
there is a warning when building with clang because there is now a
|
||||
definition of unlikely from compiler.h in tools/include/linux, which
|
||||
conflicts with the one in the instruction decoder selftest:
|
||||
|
||||
arch/x86/tools/insn_decoder_test.c:15:9: warning: 'unlikely' macro redefined [-Wmacro-redefined]
|
||||
|
||||
Remove the second unlikely() definition, as it is no longer necessary,
|
||||
clearing up the warning.
|
||||
|
||||
Fixes: c104c16073b7 ("Kunit to check the longest symbol length")
|
||||
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
|
||||
Link: https://lore.kernel.org/r/20250318-x86-decoder-test-fix-unlikely-redef-v1-1-74c84a7bf05b@kernel.org
|
||||
---
|
||||
arch/x86/tools/insn_decoder_test.c | 2 --
|
||||
1 file changed, 2 deletions(-)
|
||||
|
||||
--- a/arch/x86/tools/insn_decoder_test.c
|
||||
+++ b/arch/x86/tools/insn_decoder_test.c
|
||||
@@ -12,8 +12,6 @@
|
||||
#include <stdarg.h>
|
||||
#include <linux/kallsyms.h>
|
||||
|
||||
-#define unlikely(cond) (cond)
|
||||
-
|
||||
#include <asm/insn.h>
|
||||
#include <inat.c>
|
||||
#include <insn.c>
|
102
debian/patches/patchset-pf/fixes/0003-fbdev-Fix-do_register_framebuffer-to-prevent-null-pt.patch
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
From 5cf26cf9fd9c11cb1543aac026f8928829895663 Mon Sep 17 00:00:00 2001
|
||||
From: Murad Masimov <m.masimov@mt-integration.ru>
|
||||
Date: Mon, 28 Apr 2025 18:34:06 +0300
|
||||
Subject: fbdev: Fix do_register_framebuffer to prevent null-ptr-deref in
|
||||
fb_videomode_to_var
|
||||
|
||||
If fb_add_videomode() in do_register_framebuffer() fails to allocate
|
||||
memory for fb_videomode, it will later lead to a null-ptr dereference in
|
||||
fb_videomode_to_var(), as the fb_info is registered while not having the
|
||||
mode in modelist that is expected to be there, i.e. the one that is
|
||||
described in fb_info->var.
|
||||
|
||||
================================================================
|
||||
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
|
||||
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
|
||||
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
|
||||
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
|
||||
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
|
||||
Call Trace:
|
||||
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
|
||||
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
|
||||
resize_screen drivers/tty/vt/vt.c:1176 [inline]
|
||||
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
|
||||
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
|
||||
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
|
||||
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
|
||||
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
|
||||
vfs_ioctl fs/ioctl.c:48 [inline]
|
||||
__do_sys_ioctl fs/ioctl.c:753 [inline]
|
||||
__se_sys_ioctl fs/ioctl.c:739 [inline]
|
||||
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
|
||||
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
|
||||
entry_SYSCALL_64_after_hwframe+0x67/0xd1
|
||||
================================================================
|
||||
|
||||
Even though fbcon_init() checks beforehand if fb_match_mode() in
|
||||
var_to_display() fails, it cannot prevent the panic because fbcon_init()
|
||||
does not return an error code. Considering this and the comment in the code
|
||||
about fb_match_mode() returning NULL - "This should not happen" - it is
|
||||
better to prevent registering the fb_info if its mode was not set
|
||||
successfully. Also move fb_add_videomode() closer to the beginning of
|
||||
do_register_framebuffer() to avoid having to do the cleanup on fail.
|
||||
|
||||
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
|
||||
|
||||
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
|
||||
Signed-off-by: Helge Deller <deller@gmx.de>
|
||||
---
|
||||
drivers/video/fbdev/core/fbmem.c | 18 +++++++++++-------
|
||||
1 file changed, 11 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/drivers/video/fbdev/core/fbmem.c
|
||||
+++ b/drivers/video/fbdev/core/fbmem.c
|
||||
@@ -388,7 +388,7 @@ static int fb_check_foreignness(struct f
|
||||
|
||||
static int do_register_framebuffer(struct fb_info *fb_info)
|
||||
{
|
||||
- int i;
|
||||
+ int i, err = 0;
|
||||
struct fb_videomode mode;
|
||||
|
||||
if (fb_check_foreignness(fb_info))
|
||||
@@ -397,10 +397,18 @@ static int do_register_framebuffer(struc
|
||||
if (num_registered_fb == FB_MAX)
|
||||
return -ENXIO;
|
||||
|
||||
- num_registered_fb++;
|
||||
for (i = 0 ; i < FB_MAX; i++)
|
||||
if (!registered_fb[i])
|
||||
break;
|
||||
+
|
||||
+ if (!fb_info->modelist.prev || !fb_info->modelist.next)
|
||||
+ INIT_LIST_HEAD(&fb_info->modelist);
|
||||
+
|
||||
+ fb_var_to_videomode(&mode, &fb_info->var);
|
||||
+ err = fb_add_videomode(&mode, &fb_info->modelist);
|
||||
+ if (err < 0)
|
||||
+ return err;
|
||||
+
|
||||
fb_info->node = i;
|
||||
refcount_set(&fb_info->count, 1);
|
||||
mutex_init(&fb_info->lock);
|
||||
@@ -426,16 +434,12 @@ static int do_register_framebuffer(struc
|
||||
if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT))
|
||||
bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT);
|
||||
|
||||
- if (!fb_info->modelist.prev || !fb_info->modelist.next)
|
||||
- INIT_LIST_HEAD(&fb_info->modelist);
|
||||
-
|
||||
if (fb_info->skip_vt_switch)
|
||||
pm_vt_switch_required(fb_info->device, false);
|
||||
else
|
||||
pm_vt_switch_required(fb_info->device, true);
|
||||
|
||||
- fb_var_to_videomode(&mode, &fb_info->var);
|
||||
- fb_add_videomode(&mode, &fb_info->modelist);
|
||||
+ num_registered_fb++;
|
||||
registered_fb[i] = fb_info;
|
||||
|
||||
#ifdef CONFIG_GUMSTIX_AM200EPD
|
65
debian/patches/patchset-pf/fixes/0004-fbdev-Fix-fb_set_var-to-prevent-null-ptr-deref-in-fb.patch
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
From 54c7f478f1a9d58f5609a48d461c7d495bb8301a Mon Sep 17 00:00:00 2001
|
||||
From: Murad Masimov <m.masimov@mt-integration.ru>
|
||||
Date: Mon, 28 Apr 2025 18:34:07 +0300
|
||||
Subject: fbdev: Fix fb_set_var to prevent null-ptr-deref in
|
||||
fb_videomode_to_var
|
||||
|
||||
If fb_add_videomode() in fb_set_var() fails to allocate memory for
|
||||
fb_videomode, later it may lead to a null-ptr dereference in
|
||||
fb_videomode_to_var(), as the fb_info is registered while not having the
|
||||
mode in modelist that is expected to be there, i.e. the one that is
|
||||
described in fb_info->var.
|
||||
|
||||
================================================================
|
||||
general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI
|
||||
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
|
||||
CPU: 1 PID: 30371 Comm: syz-executor.1 Not tainted 5.10.226-syzkaller #0
|
||||
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
|
||||
RIP: 0010:fb_videomode_to_var+0x24/0x610 drivers/video/fbdev/core/modedb.c:901
|
||||
Call Trace:
|
||||
display_to_var+0x3a/0x7c0 drivers/video/fbdev/core/fbcon.c:929
|
||||
fbcon_resize+0x3e2/0x8f0 drivers/video/fbdev/core/fbcon.c:2071
|
||||
resize_screen drivers/tty/vt/vt.c:1176 [inline]
|
||||
vc_do_resize+0x53a/0x1170 drivers/tty/vt/vt.c:1263
|
||||
fbcon_modechanged+0x3ac/0x6e0 drivers/video/fbdev/core/fbcon.c:2720
|
||||
fbcon_update_vcs+0x43/0x60 drivers/video/fbdev/core/fbcon.c:2776
|
||||
do_fb_ioctl+0x6d2/0x740 drivers/video/fbdev/core/fbmem.c:1128
|
||||
fb_ioctl+0xe7/0x150 drivers/video/fbdev/core/fbmem.c:1203
|
||||
vfs_ioctl fs/ioctl.c:48 [inline]
|
||||
__do_sys_ioctl fs/ioctl.c:753 [inline]
|
||||
__se_sys_ioctl fs/ioctl.c:739 [inline]
|
||||
__x64_sys_ioctl+0x19a/0x210 fs/ioctl.c:739
|
||||
do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46
|
||||
entry_SYSCALL_64_after_hwframe+0x67/0xd1
|
||||
================================================================
|
||||
|
||||
The reason is that fb_info->var is being modified in fb_set_var(), and
|
||||
then fb_videomode_to_var() is called. If it fails to add the mode to
|
||||
fb_info->modelist, fb_set_var() returns error, but does not restore the
|
||||
old value of fb_info->var. Restore fb_info->var on failure the same way
|
||||
it is done earlier in the function.
|
||||
|
||||
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
|
||||
|
||||
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Murad Masimov <m.masimov@mt-integration.ru>
|
||||
Signed-off-by: Helge Deller <deller@gmx.de>
|
||||
---
|
||||
drivers/video/fbdev/core/fbmem.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/video/fbdev/core/fbmem.c
|
||||
+++ b/drivers/video/fbdev/core/fbmem.c
|
||||
@@ -328,8 +328,10 @@ fb_set_var(struct fb_info *info, struct
|
||||
!list_empty(&info->modelist))
|
||||
ret = fb_add_videomode(&mode, &info->modelist);
|
||||
|
||||
- if (ret)
|
||||
+ if (ret) {
|
||||
+ info->var = old_var;
|
||||
return ret;
|
||||
+ }
|
||||
|
||||
event.info = info;
|
||||
event.data = &mode;
|
@@ -1,40 +0,0 @@
|
||||
From e56acee381a8e07edf1920fb58f3166f911b6e5c Mon Sep 17 00:00:00 2001
|
||||
From: Lingbo Kong <quic_lingbok@quicinc.com>
|
||||
Date: Wed, 26 Feb 2025 19:31:18 +0800
|
||||
Subject: wifi: ath12k: Abort scan before removing link interface to prevent
|
||||
duplicate deletion
|
||||
|
||||
Currently, when ath12k performs the remove link interface operation, if
|
||||
there is an ongoing scan operation on the arvif, ath12k may execute the
|
||||
remove link interface operation multiple times on the same arvif. This
|
||||
occurs because, during the remove link operation, if a scan operation is
|
||||
present on the arvif, ath12k may receive a WMI_SCAN_EVENT_COMPLETED event
|
||||
from the firmware. Upon receiving this event, ath12k will continue to
|
||||
execute the ath12k_scan_vdev_clean_work() function, performing the remove
|
||||
link interface operation on the same arvif again.
|
||||
|
||||
To address this issue, before executing the remove link interface
|
||||
operation, ath12k needs to check if there is an ongoing scan operation on
|
||||
the current arvif. If such an operation exists, it should be aborted.
|
||||
|
||||
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.0.c5-00481-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3
|
||||
|
||||
Signed-off-by: Lingbo Kong <quic_lingbok@quicinc.com>
|
||||
---
|
||||
drivers/net/wireless/ath/ath12k/mac.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/drivers/net/wireless/ath/ath12k/mac.c
|
||||
+++ b/drivers/net/wireless/ath/ath12k/mac.c
|
||||
@@ -9395,6 +9395,11 @@ ath12k_mac_op_unassign_vif_chanctx(struc
|
||||
ar->num_started_vdevs == 1 && ar->monitor_vdev_created)
|
||||
ath12k_mac_monitor_stop(ar);
|
||||
|
||||
+ if (ar->scan.arvif == arvif && ar->scan.state == ATH12K_SCAN_RUNNING) {
|
||||
+ ath12k_scan_abort(ar);
|
||||
+ ar->scan.arvif = NULL;
|
||||
+ }
|
||||
+
|
||||
ath12k_mac_remove_link_interface(hw, arvif);
|
||||
ath12k_mac_unassign_link_vif(arvif);
|
||||
}
|
@@ -1,49 +0,0 @@
|
||||
From 8d0e02f81d08c7b1e082028af0f55a22e7e1dfb2 Mon Sep 17 00:00:00 2001
|
||||
From: Christian Brauner <brauner@kernel.org>
|
||||
Date: Tue, 15 Apr 2025 10:22:04 +0200
|
||||
Subject: Kconfig: switch CONFIG_SYSFS_SYCALL default to n
|
||||
|
||||
This odd system call will be removed in the future. Let's decouple it
|
||||
from CONFIG_EXPERT and switch the default to n as a first step.
|
||||
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
init/Kconfig | 20 ++++++++++----------
|
||||
1 file changed, 10 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/init/Kconfig
|
||||
+++ b/init/Kconfig
|
||||
@@ -1603,6 +1603,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
|
||||
the unaligned access emulation.
|
||||
see arch/parisc/kernel/unaligned.c for reference
|
||||
|
||||
+config SYSFS_SYSCALL
|
||||
+ bool "Sysfs syscall support"
|
||||
+ default n
|
||||
+ help
|
||||
+ sys_sysfs is an obsolete system call no longer supported in libc.
|
||||
+ Note that disabling this option is more secure but might break
|
||||
+ compatibility with some systems.
|
||||
+
|
||||
+ If unsure say N here.
|
||||
+
|
||||
config HAVE_PCSPKR_PLATFORM
|
||||
bool
|
||||
|
||||
@@ -1647,16 +1657,6 @@ config SGETMASK_SYSCALL
|
||||
|
||||
If unsure, leave the default option here.
|
||||
|
||||
-config SYSFS_SYSCALL
|
||||
- bool "Sysfs syscall support" if EXPERT
|
||||
- default y
|
||||
- help
|
||||
- sys_sysfs is an obsolete system call no longer supported in libc.
|
||||
- Note that disabling this option is more secure but might break
|
||||
- compatibility with some systems.
|
||||
-
|
||||
- If unsure say Y here.
|
||||
-
|
||||
config FHANDLE
|
||||
bool "open by fhandle syscalls" if EXPERT
|
||||
select EXPORTFS
|
113
debian/patches/patchset-pf/fixes/0005-anon_inode-use-a-proper-mode-internally.patch
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
From 9cb2f9d210f915aabe54c5061d84f3fbe93c71ea Mon Sep 17 00:00:00 2001
|
||||
From: Christian Brauner <brauner@kernel.org>
|
||||
Date: Mon, 7 Apr 2025 11:54:15 +0200
|
||||
Subject: anon_inode: use a proper mode internally
|
||||
|
||||
This allows the VFS to not trip over anonymous inodes and we can add
|
||||
asserts based on the mode into the vfs. When we report it to userspace
|
||||
we can simply hide the mode to avoid regressions. I've audited all
|
||||
direct callers of alloc_anon_inode() and only secretmem overrides i_mode
|
||||
and i_op inode operations but it already uses a regular file.
|
||||
|
||||
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-1-53a44c20d44e@kernel.org
|
||||
Fixes: af153bb63a336 ("vfs: catch invalid modes in may_open()")
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Cc: stable@vger.kernel.org # all LTS kernels
|
||||
Reported-by: syzbot+5d8e79d323a13aa0b248@syzkaller.appspotmail.com
|
||||
Closes: https://lore.kernel.org/all/67ed3fb3.050a0220.14623d.0009.GAE@google.com
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
fs/anon_inodes.c | 36 ++++++++++++++++++++++++++++++++++++
|
||||
fs/internal.h | 3 +++
|
||||
fs/libfs.c | 8 +++++++-
|
||||
3 files changed, 46 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/anon_inodes.c
|
||||
+++ b/fs/anon_inodes.c
|
||||
@@ -24,10 +24,44 @@
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
+#include "internal.h"
|
||||
+
|
||||
static struct vfsmount *anon_inode_mnt __ro_after_init;
|
||||
static struct inode *anon_inode_inode __ro_after_init;
|
||||
|
||||
/*
|
||||
+ * User space expects anonymous inodes to have no file type in st_mode.
|
||||
+ *
|
||||
+ * In particular, 'lsof' has this legacy logic:
|
||||
+ *
|
||||
+ * type = s->st_mode & S_IFMT;
|
||||
+ * switch (type) {
|
||||
+ * ...
|
||||
+ * case 0:
|
||||
+ * if (!strcmp(p, "anon_inode"))
|
||||
+ * Lf->ntype = Ntype = N_ANON_INODE;
|
||||
+ *
|
||||
+ * to detect our old anon_inode logic.
|
||||
+ *
|
||||
+ * Rather than mess with our internal sane inode data, just fix it
|
||||
+ * up here in getattr() by masking off the format bits.
|
||||
+ */
|
||||
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
|
||||
+ struct kstat *stat, u32 request_mask,
|
||||
+ unsigned int query_flags)
|
||||
+{
|
||||
+ struct inode *inode = d_inode(path->dentry);
|
||||
+
|
||||
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
|
||||
+ stat->mode &= ~S_IFMT;
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static const struct inode_operations anon_inode_operations = {
|
||||
+ .getattr = anon_inode_getattr,
|
||||
+};
|
||||
+
|
||||
+/*
|
||||
* anon_inodefs_dname() is called from d_path().
|
||||
*/
|
||||
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
|
||||
@@ -66,6 +100,7 @@ static struct inode *anon_inode_make_sec
|
||||
if (IS_ERR(inode))
|
||||
return inode;
|
||||
inode->i_flags &= ~S_PRIVATE;
|
||||
+ inode->i_op = &anon_inode_operations;
|
||||
error = security_inode_init_security_anon(inode, &QSTR(name),
|
||||
context_inode);
|
||||
if (error) {
|
||||
@@ -313,6 +348,7 @@ static int __init anon_inode_init(void)
|
||||
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
|
||||
if (IS_ERR(anon_inode_inode))
|
||||
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
|
||||
+ anon_inode_inode->i_op = &anon_inode_operations;
|
||||
|
||||
return 0;
|
||||
}
|
||||
--- a/fs/internal.h
|
||||
+++ b/fs/internal.h
|
||||
@@ -343,3 +343,6 @@ static inline bool path_mounted(const st
|
||||
void file_f_owner_release(struct file *file);
|
||||
bool file_seek_cur_needs_f_lock(struct file *file);
|
||||
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
|
||||
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
|
||||
+ struct kstat *stat, u32 request_mask,
|
||||
+ unsigned int query_flags);
|
||||
--- a/fs/libfs.c
|
||||
+++ b/fs/libfs.c
|
||||
@@ -1647,7 +1647,13 @@ struct inode *alloc_anon_inode(struct su
|
||||
* that it already _is_ on the dirty list.
|
||||
*/
|
||||
inode->i_state = I_DIRTY;
|
||||
- inode->i_mode = S_IRUSR | S_IWUSR;
|
||||
+ /*
|
||||
+ * Historically anonymous inodes didn't have a type at all and
|
||||
+ * userspace has come to rely on this. Internally they're just
|
||||
+ * regular files but S_IFREG is masked off when reporting
|
||||
+ * information to userspace.
|
||||
+ */
|
||||
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
|
||||
inode->i_uid = current_fsuid();
|
||||
inode->i_gid = current_fsgid();
|
||||
inode->i_flags |= S_PRIVATE;
|
80
debian/patches/patchset-pf/fixes/0006-anon_inode-explicitly-block-setattr.patch
vendored
Normal file
@@ -0,0 +1,80 @@
|
||||
From ea4199112ae6d8da866417f50e035be01488c502 Mon Sep 17 00:00:00 2001
|
||||
From: Christian Brauner <brauner@kernel.org>
|
||||
Date: Mon, 7 Apr 2025 11:54:17 +0200
|
||||
Subject: anon_inode: explicitly block ->setattr()
|
||||
|
||||
It is currently possible to change the mode and owner of the single
|
||||
anonymous inode in the kernel:
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int ret, sfd;
|
||||
sigset_t mask;
|
||||
struct signalfd_siginfo fdsi;
|
||||
|
||||
sigemptyset(&mask);
|
||||
sigaddset(&mask, SIGINT);
|
||||
sigaddset(&mask, SIGQUIT);
|
||||
|
||||
ret = sigprocmask(SIG_BLOCK, &mask, NULL);
|
||||
if (ret < 0)
|
||||
_exit(1);
|
||||
|
||||
sfd = signalfd(-1, &mask, 0);
|
||||
if (sfd < 0)
|
||||
_exit(2);
|
||||
|
||||
ret = fchown(sfd, 5555, 5555);
|
||||
if (ret < 0)
|
||||
_exit(3);
|
||||
|
||||
ret = fchmod(sfd, 0777);
|
||||
if (ret < 0)
|
||||
_exit(3);
|
||||
|
||||
_exit(4);
|
||||
}
|
||||
|
||||
This is a bug. It's not really a meaningful one because anonymous inodes
|
||||
don't really figure into path lookup and they cannot be reopened via
|
||||
/proc/<pid>/fd/<nr> and can't be used for lookup itself. So they can
|
||||
only ever serve as direct references.
|
||||
|
||||
But it is still completely bogus to allow the mode and ownership or any
|
||||
of the properties of the anonymous inode to be changed. Block this!
|
||||
|
||||
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-3-53a44c20d44e@kernel.org
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Cc: stable@vger.kernel.org # all LTS kernels
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
fs/anon_inodes.c | 7 +++++++
|
||||
fs/internal.h | 2 ++
|
||||
2 files changed, 9 insertions(+)
|
||||
|
||||
--- a/fs/anon_inodes.c
|
||||
+++ b/fs/anon_inodes.c
|
||||
@@ -57,8 +57,15 @@ int anon_inode_getattr(struct mnt_idmap
|
||||
return 0;
|
||||
}
|
||||
|
||||
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
|
||||
+ struct iattr *attr)
|
||||
+{
|
||||
+ return -EOPNOTSUPP;
|
||||
+}
|
||||
+
|
||||
static const struct inode_operations anon_inode_operations = {
|
||||
.getattr = anon_inode_getattr,
|
||||
+ .setattr = anon_inode_setattr,
|
||||
};
|
||||
|
||||
/*
|
||||
--- a/fs/internal.h
|
||||
+++ b/fs/internal.h
|
||||
@@ -346,3 +346,5 @@ int statmount_mnt_idmap(struct mnt_idmap
|
||||
int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
|
||||
struct kstat *stat, u32 request_mask,
|
||||
unsigned int query_flags);
|
||||
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
|
||||
+ struct iattr *attr);
|
39
debian/patches/patchset-pf/fixes/0007-anon_inode-raise-SB_I_NODEV-and-SB_I_NOEXEC.patch
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
From 79f54c5bc7c6097a379c83e9ed56bee27cf1218a Mon Sep 17 00:00:00 2001
|
||||
From: Christian Brauner <brauner@kernel.org>
|
||||
Date: Mon, 7 Apr 2025 11:54:19 +0200
|
||||
Subject: anon_inode: raise SB_I_NODEV and SB_I_NOEXEC
|
||||
|
||||
It isn't possible to execute anonymous inodes because they cannot be
|
||||
opened in any way after they have been created. This includes execution:
|
||||
|
||||
execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH)
|
||||
|
||||
Anonymous inodes have inode->f_op set to no_open_fops which sets
|
||||
no_open() which returns ENXIO. That means any call to do_dentry_open()
|
||||
which is the endpoint of the do_open_execat() will fail. There's no
|
||||
chance to execute an anonymous inode. Unless a given subsystem overrides
|
||||
it, of course.
|
||||
|
||||
However, we should still harden this and raise SB_I_NODEV and
|
||||
SB_I_NOEXEC on the superblock itself so that no one gets any creative
|
||||
ideas.
|
||||
|
||||
Link: https://lore.kernel.org/20250407-work-anon_inode-v1-5-53a44c20d44e@kernel.org
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Cc: stable@vger.kernel.org # all LTS kernels
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
fs/anon_inodes.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
--- a/fs/anon_inodes.c
|
||||
+++ b/fs/anon_inodes.c
|
||||
@@ -86,6 +86,8 @@ static int anon_inodefs_init_fs_context(
|
||||
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
|
||||
if (!ctx)
|
||||
return -ENOMEM;
|
||||
+ fc->s_iflags |= SB_I_NOEXEC;
|
||||
+ fc->s_iflags |= SB_I_NODEV;
|
||||
ctx->dops = &anon_inodefs_dentry_operations;
|
||||
return 0;
|
||||
}
|
136
debian/patches/patchset-pf/fixes/0008-fs-add-S_ANON_INODE.patch
vendored
Normal file
@@ -0,0 +1,136 @@
|
||||
From edaacbee0f33b7371ec460723d1042a6c5a4bb9d Mon Sep 17 00:00:00 2001
|
||||
From: Christian Brauner <brauner@kernel.org>
|
||||
Date: Mon, 21 Apr 2025 10:27:40 +0200
|
||||
Subject: fs: add S_ANON_INODE
|
||||
|
||||
This makes it easy to detect proper anonymous inodes and to ensure that
|
||||
we can detect them in codepaths such as readahead().
|
||||
|
||||
Readahead on anonymous inodes didn't work because they didn't have a
|
||||
proper mode. Now that they do, we need to retain EINVAL being returned
|
||||
otherwise LTP will fail.
|
||||
|
||||
We also need to ensure that ioctls aren't simply fired like they are for
|
||||
regular files so things like inotify inodes continue to correctly call
|
||||
their own ioctl handlers as in [1].
|
||||
|
||||
Reported-by: Xilin Wu <sophon@radxa.com>
|
||||
Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1]
|
||||
Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
fs/ioctl.c | 7 ++++---
|
||||
fs/libfs.c | 2 +-
|
||||
fs/pidfs.c | 2 +-
|
||||
include/linux/fs.h | 2 ++
|
||||
mm/readahead.c | 20 ++++++++++++++++----
|
||||
5 files changed, 24 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/fs/ioctl.c
|
||||
+++ b/fs/ioctl.c
|
||||
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *fil
|
||||
return ioctl_fioasync(fd, filp, argp);
|
||||
|
||||
case FIOQSIZE:
|
||||
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
|
||||
+ if (S_ISDIR(inode->i_mode) ||
|
||||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
|
||||
S_ISLNK(inode->i_mode)) {
|
||||
loff_t res = inode_get_bytes(inode);
|
||||
return copy_to_user(argp, &res, sizeof(res)) ?
|
||||
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *fil
|
||||
return ioctl_file_dedupe_range(filp, argp);
|
||||
|
||||
case FIONREAD:
|
||||
- if (!S_ISREG(inode->i_mode))
|
||||
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
|
||||
return vfs_ioctl(filp, cmd, arg);
|
||||
|
||||
return put_user(i_size_read(inode) - filp->f_pos,
|
||||
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *fil
|
||||
return ioctl_get_fs_sysfs_path(filp, argp);
|
||||
|
||||
default:
|
||||
- if (S_ISREG(inode->i_mode))
|
||||
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
|
||||
return file_ioctl(filp, cmd, argp);
|
||||
break;
|
||||
}
|
||||
--- a/fs/libfs.c
|
||||
+++ b/fs/libfs.c
|
||||
@@ -1656,7 +1656,7 @@ struct inode *alloc_anon_inode(struct su
|
||||
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
|
||||
inode->i_uid = current_fsuid();
|
||||
inode->i_gid = current_fsgid();
|
||||
- inode->i_flags |= S_PRIVATE;
|
||||
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
|
||||
simple_inode_init_ts(inode);
|
||||
return inode;
|
||||
}
|
||||
--- a/fs/pidfs.c
|
||||
+++ b/fs/pidfs.c
|
||||
@@ -826,7 +826,7 @@ static int pidfs_init_inode(struct inode
|
||||
const struct pid *pid = data;
|
||||
|
||||
inode->i_private = data;
|
||||
- inode->i_flags |= S_PRIVATE;
|
||||
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
|
||||
inode->i_mode |= S_IRWXU;
|
||||
inode->i_op = &pidfs_inode_operations;
|
||||
inode->i_fop = &pidfs_file_operations;
|
||||
--- a/include/linux/fs.h
|
||||
+++ b/include/linux/fs.h
|
||||
@@ -2344,6 +2344,7 @@ struct super_operations {
|
||||
#define S_CASEFOLD (1 << 15) /* Casefolded file */
|
||||
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
|
||||
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
|
||||
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
|
||||
|
||||
/*
|
||||
* Note that nosuid etc flags are inode-specific: setting some file-system
|
||||
@@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struc
|
||||
|
||||
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
|
||||
(inode)->i_rdev == WHITEOUT_DEV)
|
||||
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
|
||||
|
||||
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
|
||||
struct inode *inode)
|
||||
--- a/mm/readahead.c
|
||||
+++ b/mm/readahead.c
|
||||
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
|
||||
|
||||
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
|
||||
{
|
||||
+ struct file *file;
|
||||
+ const struct inode *inode;
|
||||
+
|
||||
CLASS(fd, f)(fd);
|
||||
+ if (fd_empty(f))
|
||||
+ return -EBADF;
|
||||
|
||||
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
|
||||
+ file = fd_file(f);
|
||||
+ if (!(file->f_mode & FMODE_READ))
|
||||
return -EBADF;
|
||||
|
||||
/*
|
||||
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t of
|
||||
* that can execute readahead. If readahead is not possible
|
||||
* on this file, then we must return -EINVAL.
|
||||
*/
|
||||
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
|
||||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
|
||||
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
|
||||
+ if (!file->f_mapping)
|
||||
+ return -EINVAL;
|
||||
+ if (!file->f_mapping->a_ops)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ inode = file_inode(file);
|
||||
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
|
||||
+ return -EINVAL;
|
||||
+ if (IS_ANON_FILE(inode))
|
||||
return -EINVAL;
|
||||
|
||||
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
|
35
debian/patches/patchset-pf/fixes/0009-configfs-Do-not-override-creating-attribute-file-fai.patch
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
From ab287d709809b6dfe4d3c42016a543d976533d51 Mon Sep 17 00:00:00 2001
|
||||
From: Zijun Hu <quic_zijuhu@quicinc.com>
|
||||
Date: Wed, 7 May 2025 19:50:26 +0800
|
||||
Subject: configfs: Do not override creating attribute file failure in
|
||||
populate_attrs()
|
||||
|
||||
populate_attrs() may override failure for creating attribute files
|
||||
by success for creating subsequent bin attribute files, and end up with a
|
||||
wrong return value.
|
||||
|
||||
Fix this by creating bin attribute files only after successfully creating
|
||||
attribute files.
|
||||
|
||||
Fixes: 03607ace807b ("configfs: implement binary attributes")
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Joel Becker <jlbec@evilplan.org>
|
||||
Reviewed-by: Breno Leitao <leitao@debian.org>
|
||||
Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
|
||||
Link: https://lore.kernel.org/r/20250507-fix_configfs-v3-2-fe2d96de8dc4@quicinc.com
|
||||
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
|
||||
---
|
||||
fs/configfs/dir.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/configfs/dir.c
|
||||
+++ b/fs/configfs/dir.c
|
||||
@@ -619,7 +619,7 @@ static int populate_attrs(struct config_
|
||||
break;
|
||||
}
|
||||
}
|
||||
- if (t->ct_bin_attrs) {
|
||||
+ if (!error && t->ct_bin_attrs) {
|
||||
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
|
||||
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
|
||||
continue;
|
104
debian/patches/patchset-pf/fixes/0010-Don-t-propagate-mounts-into-detached-trees.patch
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
From 896b7b0d6ed53a7fe159c4b76f25407c816aa619 Mon Sep 17 00:00:00 2001
|
||||
From: Al Viro <viro@zeniv.linux.org.uk>
|
||||
Date: Fri, 23 May 2025 19:20:36 -0400
|
||||
Subject: Don't propagate mounts into detached trees
|
||||
|
||||
All versions up to 6.14 did not propagate mount events into detached
|
||||
trees. Shortly after 6.14 a merge of vfs-6.15-rc1.mount.namespace
|
||||
(130e696aa68b) has changed that.
|
||||
|
||||
Unfortunately, that has caused userland regressions (reported in
|
||||
https://lore.kernel.org/all/CAOYeF9WQhFDe+BGW=Dp5fK8oRy5AgZ6zokVyTj1Wp4EUiYgt4w@mail.gmail.com/)
|
||||
|
||||
Straight revert wouldn't be an option - in particular, the variant in 6.14
|
||||
had a bug that got fixed in d1ddc6f1d9f0 ("fix IS_MNT_PROPAGATING uses")
|
||||
and we don't want to bring the bug back.
|
||||
|
||||
This is a modification of manual revert posted by Christian, with changes
|
||||
needed to avoid reintroducing the breakage in scenario described in
|
||||
d1ddc6f1d9f0.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Reported-by: Allison Karlitskaya <lis@redhat.com>
|
||||
Tested-by: Allison Karlitskaya <lis@redhat.com>
|
||||
Acked-by: Christian Brauner <brauner@kernel.org>
|
||||
Co-developed-by: Christian Brauner <brauner@kernel.org>
|
||||
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
||||
---
|
||||
fs/mount.h | 5 -----
|
||||
fs/namespace.c | 15 ++-------------
|
||||
fs/pnode.c | 4 ++--
|
||||
3 files changed, 4 insertions(+), 20 deletions(-)
|
||||
|
||||
--- a/fs/mount.h
|
||||
+++ b/fs/mount.h
|
||||
@@ -7,10 +7,6 @@
|
||||
|
||||
extern struct list_head notify_list;
|
||||
|
||||
-typedef __u32 __bitwise mntns_flags_t;
|
||||
-
|
||||
-#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0))
|
||||
-
|
||||
struct mnt_namespace {
|
||||
struct ns_common ns;
|
||||
struct mount * root;
|
||||
@@ -37,7 +33,6 @@ struct mnt_namespace {
|
||||
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
|
||||
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
|
||||
refcount_t passive; /* number references not pinning @mounts */
|
||||
- mntns_flags_t mntns_flags;
|
||||
} __randomize_layout;
|
||||
|
||||
struct mnt_pcp {
|
||||
--- a/fs/namespace.c
|
||||
+++ b/fs/namespace.c
|
||||
@@ -3648,7 +3648,7 @@ static int do_move_mount(struct path *ol
|
||||
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
|
||||
goto out;
|
||||
|
||||
- if (is_anon_ns(ns)) {
|
||||
+ if (is_anon_ns(ns) && ns == p->mnt_ns) {
|
||||
/*
|
||||
* Ending up with two files referring to the root of the
|
||||
* same anonymous mount namespace would cause an error
|
||||
@@ -3656,16 +3656,7 @@ static int do_move_mount(struct path *ol
|
||||
* twice into the mount tree which would be rejected
|
||||
* later. But be explicit about it right here.
|
||||
*/
|
||||
- if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
|
||||
- goto out;
|
||||
-
|
||||
- /*
|
||||
- * If this is an anonymous mount tree ensure that mount
|
||||
- * propagation can detect mounts that were just
|
||||
- * propagated to the target mount tree so we don't
|
||||
- * propagate onto them.
|
||||
- */
|
||||
- ns->mntns_flags |= MNTNS_PROPAGATING;
|
||||
+ goto out;
|
||||
} else if (is_anon_ns(p->mnt_ns)) {
|
||||
/*
|
||||
* Don't allow moving an attached mount tree to an
|
||||
@@ -3722,8 +3713,6 @@ static int do_move_mount(struct path *ol
|
||||
if (attached)
|
||||
put_mountpoint(old_mp);
|
||||
out:
|
||||
- if (is_anon_ns(ns))
|
||||
- ns->mntns_flags &= ~MNTNS_PROPAGATING;
|
||||
unlock_mount(mp);
|
||||
if (!err) {
|
||||
if (attached) {
|
||||
--- a/fs/pnode.c
|
||||
+++ b/fs/pnode.c
|
||||
@@ -231,8 +231,8 @@ static int propagate_one(struct mount *m
|
||||
/* skip if mountpoint isn't visible in m */
|
||||
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
|
||||
return 0;
|
||||
- /* skip if m is in the anon_ns we are emptying */
|
||||
- if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING)
|
||||
+ /* skip if m is in the anon_ns */
|
||||
+ if (is_anon_ns(m->mnt_ns))
|
||||
return 0;
|
||||
|
||||
if (peers(m, last_dest)) {
|
51
debian/patches/patchset-pf/fixes/0011-mm-filemap-gate-dropbehind-invalidate-on-folio-dirty.patch
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
From bc86aaf0e0256220ca787fdbb57a73429ade1129 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Tue, 27 May 2025 07:28:52 -0600
|
||||
Subject: mm/filemap: gate dropbehind invalidate on folio !dirty && !writeback
|
||||
|
||||
It's possible for the folio to either get marked for writeback or
|
||||
redirtied. Add a helper, filemap_end_dropbehind(), which guards the
|
||||
folio_unmap_invalidate() call behind a check for the folio being both
|
||||
non-dirty and not under writeback AFTER the folio lock has been
|
||||
acquired. Use this helper in folio_end_dropbehind_write().
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
|
||||
Fixes: fb7d3bc41493 ("mm/filemap: drop streaming/uncached pages when writeback completes")
|
||||
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
Link: https://lore.kernel.org/20250527133255.452431-2-axboe@kernel.dk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
mm/filemap.c | 13 +++++++++++--
|
||||
1 file changed, 11 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/mm/filemap.c
|
||||
+++ b/mm/filemap.c
|
||||
@@ -1589,6 +1589,16 @@ int folio_wait_private_2_killable(struct
|
||||
}
|
||||
EXPORT_SYMBOL(folio_wait_private_2_killable);
|
||||
|
||||
+static void filemap_end_dropbehind(struct folio *folio)
|
||||
+{
|
||||
+ struct address_space *mapping = folio->mapping;
|
||||
+
|
||||
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
+
|
||||
+ if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
|
||||
+ folio_unmap_invalidate(mapping, folio, 0);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* If folio was marked as dropbehind, then pages should be dropped when writeback
|
||||
* completes. Do that now. If we fail, it's likely because of a big folio -
|
||||
@@ -1604,8 +1614,7 @@ static void folio_end_dropbehind_write(s
|
||||
* invalidation in that case.
|
||||
*/
|
||||
if (in_task() && folio_trylock(folio)) {
|
||||
- if (folio->mapping)
|
||||
- folio_unmap_invalidate(folio->mapping, folio, 0);
|
||||
+ filemap_end_dropbehind(folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
}
|
51
debian/patches/patchset-pf/fixes/0012-mm-filemap-use-filemap_end_dropbehind-for-read-inval.patch
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
From fad76185ca91983990c660642151083eb05cbfc0 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Tue, 27 May 2025 07:28:53 -0600
|
||||
Subject: mm/filemap: use filemap_end_dropbehind() for read invalidation
|
||||
|
||||
Use the filemap_end_dropbehind() helper rather than calling
|
||||
folio_unmap_invalidate() directly, as we need to check if the folio has
|
||||
been redirtied or marked for writeback once the folio lock has been
|
||||
re-acquired.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Reported-by: Trond Myklebust <trondmy@hammerspace.com>
|
||||
Fixes: 8026e49bff9b ("mm/filemap: add read support for RWF_DONTCACHE")
|
||||
Link: https://lore.kernel.org/linux-fsdevel/ba8a9805331ce258a622feaca266b163db681a10.camel@hammerspace.com/
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
Link: https://lore.kernel.org/20250527133255.452431-3-axboe@kernel.dk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
mm/filemap.c | 7 +++----
|
||||
1 file changed, 3 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/mm/filemap.c
|
||||
+++ b/mm/filemap.c
|
||||
@@ -2644,8 +2644,7 @@ static inline bool pos_same_folio(loff_t
|
||||
return (pos1 >> shift == pos2 >> shift);
|
||||
}
|
||||
|
||||
-static void filemap_end_dropbehind_read(struct address_space *mapping,
|
||||
- struct folio *folio)
|
||||
+static void filemap_end_dropbehind_read(struct folio *folio)
|
||||
{
|
||||
if (!folio_test_dropbehind(folio))
|
||||
return;
|
||||
@@ -2653,7 +2652,7 @@ static void filemap_end_dropbehind_read(
|
||||
return;
|
||||
if (folio_trylock(folio)) {
|
||||
if (folio_test_clear_dropbehind(folio))
|
||||
- folio_unmap_invalidate(mapping, folio, 0);
|
||||
+ filemap_end_dropbehind(folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
}
|
||||
@@ -2774,7 +2773,7 @@ put_folios:
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
|
||||
- filemap_end_dropbehind_read(mapping, folio);
|
||||
+ filemap_end_dropbehind_read(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
folio_batch_init(&fbatch);
|
29
debian/patches/patchset-pf/fixes/0013-Revert-Disable-FOP_DONTCACHE-for-now-due-to-bugs.patch
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
From f0579d45f2e03fa3ba0d9466e79a31ea37acb487 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Tue, 27 May 2025 07:28:54 -0600
|
||||
Subject: Revert "Disable FOP_DONTCACHE for now due to bugs"
|
||||
|
||||
This reverts commit 478ad02d6844217cc7568619aeb0809d93ade43d.
|
||||
|
||||
Both the read and write side dirty && writeback races should be resolved
|
||||
now, so revert the commit that disabled FOP_DONTCACHE for filesystems.
|
||||
|
||||
Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
Link: https://lore.kernel.org/20250527133255.452431-4-axboe@kernel.dk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
include/linux/fs.h | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/include/linux/fs.h
|
||||
+++ b/include/linux/fs.h
|
||||
@@ -2186,7 +2186,7 @@ struct file_operations {
|
||||
/* Supports asynchronous lock callbacks */
|
||||
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
|
||||
/* File system supports uncached read/write buffered IO */
|
||||
-#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */
|
||||
+#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
|
||||
|
||||
/* Wrap a directory iterator that needs exclusive inode access */
|
||||
int wrap_directory_iterator(struct file *, struct dir_context *,
|
36
debian/patches/patchset-pf/fixes/0014-mm-filemap-unify-read-write-dropbehind-naming.patch
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
From 3b4614564770691cf3a6eb88127268ef6a84180c Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Tue, 27 May 2025 07:28:55 -0600
|
||||
Subject: mm/filemap: unify read/write dropbehind naming
|
||||
|
||||
The read side is filemap_end_dropbehind_read(), while the write side
|
||||
used folio_ as the prefix rather than filemap_. The read side makes more
|
||||
sense, so unify the naming such that the write side follows it.
|
||||
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
Link: https://lore.kernel.org/20250527133255.452431-5-axboe@kernel.dk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
mm/filemap.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/mm/filemap.c
|
||||
+++ b/mm/filemap.c
|
||||
@@ -1604,7 +1604,7 @@ static void filemap_end_dropbehind(struc
|
||||
* completes. Do that now. If we fail, it's likely because of a big folio -
|
||||
* just reset dropbehind for that case and latter completions should invalidate.
|
||||
*/
|
||||
-static void folio_end_dropbehind_write(struct folio *folio)
|
||||
+static void filemap_end_dropbehind_write(struct folio *folio)
|
||||
{
|
||||
/*
|
||||
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
|
||||
@@ -1659,7 +1659,7 @@ void folio_end_writeback(struct folio *f
|
||||
acct_reclaim_writeback(folio);
|
||||
|
||||
if (folio_dropbehind)
|
||||
- folio_end_dropbehind_write(folio);
|
||||
+ filemap_end_dropbehind_write(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
EXPORT_SYMBOL(folio_end_writeback);
|
78
debian/patches/patchset-pf/fixes/0015-mm-filemap-unify-dropbehind-flag-testing-and-clearin.patch
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
From 6003153e1bc4ad4952773081d7b89aa1ab2274c3 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Tue, 27 May 2025 07:28:56 -0600
|
||||
Subject: mm/filemap: unify dropbehind flag testing and clearing
|
||||
|
||||
The read and write sides do this a bit differently; unify it such that
|
||||
the _{read,write} helpers check the bit before locking, and the generic
|
||||
handler is in charge of clearing the bit and invalidating, once under
|
||||
the folio lock.
|
||||
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
Link: https://lore.kernel.org/20250527133255.452431-6-axboe@kernel.dk
|
||||
Signed-off-by: Christian Brauner <brauner@kernel.org>
|
||||
---
|
||||
mm/filemap.c | 21 +++++++++++----------
|
||||
1 file changed, 11 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/mm/filemap.c
|
||||
+++ b/mm/filemap.c
|
||||
@@ -1595,7 +1595,11 @@ static void filemap_end_dropbehind(struc
|
||||
|
||||
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
|
||||
- if (mapping && !folio_test_writeback(folio) && !folio_test_dirty(folio))
|
||||
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
|
||||
+ return;
|
||||
+ if (!folio_test_clear_dropbehind(folio))
|
||||
+ return;
|
||||
+ if (mapping)
|
||||
folio_unmap_invalidate(mapping, folio, 0);
|
||||
}
|
||||
|
||||
@@ -1606,6 +1610,9 @@ static void filemap_end_dropbehind(struc
|
||||
*/
|
||||
static void filemap_end_dropbehind_write(struct folio *folio)
|
||||
{
|
||||
+ if (!folio_test_dropbehind(folio))
|
||||
+ return;
|
||||
+
|
||||
/*
|
||||
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
|
||||
* but can happen if normal writeback just happens to find dirty folios
|
||||
@@ -1629,8 +1636,6 @@ static void filemap_end_dropbehind_write
|
||||
*/
|
||||
void folio_end_writeback(struct folio *folio)
|
||||
{
|
||||
- bool folio_dropbehind = false;
|
||||
-
|
||||
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
|
||||
|
||||
/*
|
||||
@@ -1652,14 +1657,11 @@ void folio_end_writeback(struct folio *f
|
||||
* reused before the folio_wake_bit().
|
||||
*/
|
||||
folio_get(folio);
|
||||
- if (!folio_test_dirty(folio))
|
||||
- folio_dropbehind = folio_test_clear_dropbehind(folio);
|
||||
if (__folio_end_writeback(folio))
|
||||
folio_wake_bit(folio, PG_writeback);
|
||||
- acct_reclaim_writeback(folio);
|
||||
|
||||
- if (folio_dropbehind)
|
||||
- filemap_end_dropbehind_write(folio);
|
||||
+ filemap_end_dropbehind_write(folio);
|
||||
+ acct_reclaim_writeback(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
EXPORT_SYMBOL(folio_end_writeback);
|
||||
@@ -2651,8 +2653,7 @@ static void filemap_end_dropbehind_read(
|
||||
if (folio_test_writeback(folio) || folio_test_dirty(folio))
|
||||
return;
|
||||
if (folio_trylock(folio)) {
|
||||
- if (folio_test_clear_dropbehind(folio))
|
||||
- filemap_end_dropbehind(folio);
|
||||
+ filemap_end_dropbehind(folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
}
|
98
debian/patches/patchset-pf/fixes/0016-mm-khugepaged-fix-race-with-folio-split-free-using-t.patch
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
From 61c0b2450f2b85c5053fa4f71d9c619b34d3af6c Mon Sep 17 00:00:00 2001
|
||||
From: Shivank Garg <shivankg@amd.com>
|
||||
Date: Mon, 26 May 2025 18:28:18 +0000
|
||||
Subject: mm/khugepaged: fix race with folio split/free using temporary
|
||||
reference
|
||||
|
||||
hpage_collapse_scan_file() calls is_refcount_suitable(), which in turn
|
||||
calls folio_mapcount(). folio_mapcount() checks folio_test_large() before
|
||||
proceeding to folio_large_mapcount(), but there is a race window where the
|
||||
folio may get split/freed between these checks, triggering:
|
||||
|
||||
VM_WARN_ON_FOLIO(!folio_test_large(folio), folio)
|
||||
|
||||
Take a temporary reference to the folio in hpage_collapse_scan_file().
|
||||
This stabilizes the folio during refcount check and prevents incorrect
|
||||
large folio detection due to concurrent split/free. Use helper
|
||||
folio_expected_ref_count() + 1 to compare with folio_ref_count() instead
|
||||
of using is_refcount_suitable().
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250526182818.37978-1-shivankg@amd.com
|
||||
Fixes: 05c5323b2a34 ("mm: track mapcount of large folios in single value")
|
||||
Signed-off-by: Shivank Garg <shivankg@amd.com>
|
||||
Reported-by: syzbot+2b99589e33edbe9475ca@syzkaller.appspotmail.com
|
||||
Closes: https://lore.kernel.org/all/6828470d.a70a0220.38f255.000c.GAE@google.com
|
||||
Suggested-by: David Hildenbrand <david@redhat.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Acked-by: Dev Jain <dev.jain@arm.com>
|
||||
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
|
||||
Cc: Bharata B Rao <bharata@amd.com>
|
||||
Cc: Fengwei Yin <fengwei.yin@intel.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Mariano Pache <npache@redhat.com>
|
||||
Cc: Ryan Roberts <ryan.roberts@arm.com>
|
||||
Cc: Zi Yan <ziy@nvidia.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/khugepaged.c | 18 +++++++++++++++++-
|
||||
1 file changed, 17 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/mm/khugepaged.c
|
||||
+++ b/mm/khugepaged.c
|
||||
@@ -2295,6 +2295,17 @@ static int hpage_collapse_scan_file(stru
|
||||
continue;
|
||||
}
|
||||
|
||||
+ if (!folio_try_get(folio)) {
|
||||
+ xas_reset(&xas);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ if (unlikely(folio != xas_reload(&xas))) {
|
||||
+ folio_put(folio);
|
||||
+ xas_reset(&xas);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
if (folio_order(folio) == HPAGE_PMD_ORDER &&
|
||||
folio->index == start) {
|
||||
/* Maybe PMD-mapped */
|
||||
@@ -2305,23 +2316,27 @@ static int hpage_collapse_scan_file(stru
|
||||
* it's safe to skip LRU and refcount checks before
|
||||
* returning.
|
||||
*/
|
||||
+ folio_put(folio);
|
||||
break;
|
||||
}
|
||||
|
||||
node = folio_nid(folio);
|
||||
if (hpage_collapse_scan_abort(node, cc)) {
|
||||
result = SCAN_SCAN_ABORT;
|
||||
+ folio_put(folio);
|
||||
break;
|
||||
}
|
||||
cc->node_load[node]++;
|
||||
|
||||
if (!folio_test_lru(folio)) {
|
||||
result = SCAN_PAGE_LRU;
|
||||
+ folio_put(folio);
|
||||
break;
|
||||
}
|
||||
|
||||
- if (!is_refcount_suitable(folio)) {
|
||||
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
|
||||
result = SCAN_PAGE_COUNT;
|
||||
+ folio_put(folio);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -2333,6 +2348,7 @@ static int hpage_collapse_scan_file(stru
|
||||
*/
|
||||
|
||||
present += folio_nr_pages(folio);
|
||||
+ folio_put(folio);
|
||||
|
||||
if (need_resched()) {
|
||||
xas_pause(&xas);
|
198
debian/patches/patchset-pf/fixes/0017-mm-add-folio_expected_ref_count-for-reference-count-.patch
vendored
Normal file
@@ -0,0 +1,198 @@
|
||||
From 214092002cbd9945b7cc6314e76ec42b3f588c01 Mon Sep 17 00:00:00 2001
|
||||
From: Shivank Garg <shivankg@amd.com>
|
||||
Date: Wed, 30 Apr 2025 10:01:51 +0000
|
||||
Subject: mm: add folio_expected_ref_count() for reference count calculation
|
||||
|
||||
Patch series " JFS: Implement migrate_folio for jfs_metapage_aops" v5.
|
||||
|
||||
This patchset addresses a warning that occurs during memory compaction due
|
||||
to JFS's missing migrate_folio operation. The warning was introduced by
|
||||
commit 7ee3647243e5 ("migrate: Remove call to ->writepage") which added
|
||||
explicit warnings when filesystems don't implement migrate_folio.
|
||||
|
||||
The syzbot reported following [1]:
|
||||
jfs_metapage_aops does not implement migrate_folio
|
||||
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 fallback_migrate_folio mm/migrate.c:953 [inline]
|
||||
WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
|
||||
Modules linked in:
|
||||
CPU: 1 UID: 0 PID: 5861 Comm: syz-executor280 Not tainted 6.15.0-rc1-next-20250411-syzkaller #0 PREEMPT(full)
|
||||
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025
|
||||
RIP: 0010:fallback_migrate_folio mm/migrate.c:953 [inline]
|
||||
RIP: 0010:move_to_new_folio+0x70e/0x840 mm/migrate.c:1007
|
||||
|
||||
To fix this issue, this series implement metapage_migrate_folio() for JFS
|
||||
which handles both single and multiple metapages per page configurations.
|
||||
|
||||
While most filesystems leverage existing migration implementations like
|
||||
filemap_migrate_folio(), buffer_migrate_folio_norefs() or
|
||||
buffer_migrate_folio() (which internally used folio_expected_refs()),
|
||||
JFS's metapage architecture requires special handling of its private data
|
||||
during migration. To support this, this series introduce the
|
||||
folio_expected_ref_count(), which calculates external references to a
|
||||
folio from page/swap cache, private data, and page table mappings.
|
||||
|
||||
This standardized implementation replaces the previous ad-hoc
|
||||
folio_expected_refs() function and enables JFS to accurately determine
|
||||
whether a folio has unexpected references before attempting migration.
|
||||
|
||||
|
||||
|
||||
|
||||
Implement folio_expected_ref_count() to calculate expected folio reference
|
||||
counts from:
|
||||
- Page/swap cache (1 per page)
|
||||
- Private data (1)
|
||||
- Page table mappings (1 per map)
|
||||
|
||||
While originally needed for page migration operations, this improved
|
||||
implementation standardizes reference counting by consolidating all
|
||||
refcount contributors into a single, reusable function that can benefit
|
||||
any subsystem needing to detect unexpected references to folios.
|
||||
|
||||
The folio_expected_ref_count() returns the sum of these external
|
||||
references without including any reference the caller itself might hold.
|
||||
Callers comparing against the actual folio_ref_count() must account for
|
||||
their own references separately.
|
||||
|
||||
Link: https://syzkaller.appspot.com/bug?extid=8bb6fd945af4e0ad9299 [1]
|
||||
Link: https://lkml.kernel.org/r/20250430100150.279751-1-shivankg@amd.com
|
||||
Link: https://lkml.kernel.org/r/20250430100150.279751-2-shivankg@amd.com
|
||||
Signed-off-by: David Hildenbrand <david@redhat.com>
|
||||
Signed-off-by: Shivank Garg <shivankg@amd.com>
|
||||
Suggested-by: Matthew Wilcox <willy@infradead.org>
|
||||
Co-developed-by: David Hildenbrand <david@redhat.com>
|
||||
Cc: Alistair Popple <apopple@nvidia.com>
|
||||
Cc: Dave Kleikamp <shaggy@kernel.org>
|
||||
Cc: Donet Tom <donettom@linux.ibm.com>
|
||||
Cc: Jane Chu <jane.chu@oracle.com>
|
||||
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
|
||||
Cc: Zi Yan <ziy@nvidia.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
|
||||
mm/migrate.c | 22 ++++---------------
|
||||
2 files changed, 59 insertions(+), 18 deletions(-)
|
||||
|
||||
--- a/include/linux/mm.h
|
||||
+++ b/include/linux/mm.h
|
||||
@@ -2307,6 +2307,61 @@ static inline bool folio_maybe_mapped_sh
|
||||
return folio_test_large_maybe_mapped_shared(folio);
|
||||
}
|
||||
|
||||
+/**
|
||||
+ * folio_expected_ref_count - calculate the expected folio refcount
|
||||
+ * @folio: the folio
|
||||
+ *
|
||||
+ * Calculate the expected folio refcount, taking references from the pagecache,
|
||||
+ * swapcache, PG_private and page table mappings into account. Useful in
|
||||
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
|
||||
+ * GUP or other temporary references).
|
||||
+ *
|
||||
+ * Does currently not consider references from the LRU cache. If the folio
|
||||
+ * was isolated from the LRU (which is the case during migration or split),
|
||||
+ * the LRU cache does not apply.
|
||||
+ *
|
||||
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
|
||||
+ * locked will return a stable result.
|
||||
+ *
|
||||
+ * Calling this function on a mapped folio will not result in a stable result,
|
||||
+ * because nothing stops additional page table mappings from coming (e.g.,
|
||||
+ * fork()) or going (e.g., munmap()).
|
||||
+ *
|
||||
+ * Calling this function without the folio lock will also not result in a
|
||||
+ * stable result: for example, the folio might get dropped from the swapcache
|
||||
+ * concurrently.
|
||||
+ *
|
||||
+ * However, even when called without the folio lock or on a mapped folio,
|
||||
+ * this function can be used to detect unexpected references early (for example,
|
||||
+ * if it makes sense to even lock the folio and unmap it).
|
||||
+ *
|
||||
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
|
||||
+ * holding itself to the result.
|
||||
+ *
|
||||
+ * Returns the expected folio refcount.
|
||||
+ */
|
||||
+static inline int folio_expected_ref_count(const struct folio *folio)
|
||||
+{
|
||||
+ const int order = folio_order(folio);
|
||||
+ int ref_count = 0;
|
||||
+
|
||||
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
|
||||
+ return 0;
|
||||
+
|
||||
+ if (folio_test_anon(folio)) {
|
||||
+ /* One reference per page from the swapcache. */
|
||||
+ ref_count += folio_test_swapcache(folio) << order;
|
||||
+ } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
|
||||
+ /* One reference per page from the pagecache. */
|
||||
+ ref_count += !!folio->mapping << order;
|
||||
+ /* One reference from PG_private. */
|
||||
+ ref_count += folio_test_private(folio);
|
||||
+ }
|
||||
+
|
||||
+ /* One reference per page table mapping. */
|
||||
+ return ref_count + folio_mapcount(folio);
|
||||
+}
|
||||
+
|
||||
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
|
||||
static inline int arch_make_folio_accessible(struct folio *folio)
|
||||
{
|
||||
--- a/mm/migrate.c
|
||||
+++ b/mm/migrate.c
|
||||
@@ -445,20 +445,6 @@ unlock:
|
||||
}
|
||||
#endif
|
||||
|
||||
-static int folio_expected_refs(struct address_space *mapping,
|
||||
- struct folio *folio)
|
||||
-{
|
||||
- int refs = 1;
|
||||
- if (!mapping)
|
||||
- return refs;
|
||||
-
|
||||
- refs += folio_nr_pages(folio);
|
||||
- if (folio_test_private(folio))
|
||||
- refs++;
|
||||
-
|
||||
- return refs;
|
||||
-}
|
||||
-
|
||||
/*
|
||||
* Replace the folio in the mapping.
|
||||
*
|
||||
@@ -601,7 +587,7 @@ static int __folio_migrate_mapping(struc
|
||||
int folio_migrate_mapping(struct address_space *mapping,
|
||||
struct folio *newfolio, struct folio *folio, int extra_count)
|
||||
{
|
||||
- int expected_count = folio_expected_refs(mapping, folio) + extra_count;
|
||||
+ int expected_count = folio_expected_ref_count(folio) + extra_count + 1;
|
||||
|
||||
if (folio_ref_count(folio) != expected_count)
|
||||
return -EAGAIN;
|
||||
@@ -618,7 +604,7 @@ int migrate_huge_page_move_mapping(struc
|
||||
struct folio *dst, struct folio *src)
|
||||
{
|
||||
XA_STATE(xas, &mapping->i_pages, folio_index(src));
|
||||
- int rc, expected_count = folio_expected_refs(mapping, src);
|
||||
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
|
||||
|
||||
if (folio_ref_count(src) != expected_count)
|
||||
return -EAGAIN;
|
||||
@@ -749,7 +735,7 @@ static int __migrate_folio(struct addres
|
||||
struct folio *src, void *src_private,
|
||||
enum migrate_mode mode)
|
||||
{
|
||||
- int rc, expected_count = folio_expected_refs(mapping, src);
|
||||
+ int rc, expected_count = folio_expected_ref_count(src) + 1;
|
||||
|
||||
/* Check whether src does not have extra refs before we do more work */
|
||||
if (folio_ref_count(src) != expected_count)
|
||||
@@ -837,7 +823,7 @@ static int __buffer_migrate_folio(struct
|
||||
return migrate_folio(mapping, dst, src, mode);
|
||||
|
||||
/* Check whether page does not have extra refs before we do more work */
|
||||
- expected_count = folio_expected_refs(mapping, src);
|
||||
+ expected_count = folio_expected_ref_count(src) + 1;
|
||||
if (folio_ref_count(src) != expected_count)
|
||||
return -EAGAIN;
|
||||
|
129
debian/patches/patchset-pf/fixes/0018-mm-fix-uprobe-pte-be-overwritten-when-expanding-vma.patch
vendored
Normal file
@@ -0,0 +1,129 @@
|
||||
From 0f52f05148589fe4115322a9cc8ffab760091a0a Mon Sep 17 00:00:00 2001
|
||||
From: Pu Lehui <pulehui@huawei.com>
|
||||
Date: Thu, 29 May 2025 15:56:47 +0000
|
||||
Subject: mm: fix uprobe pte be overwritten when expanding vma
|
||||
|
||||
Patch series "Fix uprobe pte be overwritten when expanding vma".
|
||||
|
||||
|
||||
This patch (of 4):
|
||||
|
||||
We encountered a BUG alert triggered by Syzkaller as follows:
|
||||
BUG: Bad rss-counter state mm:00000000b4a60fca type:MM_ANONPAGES val:1
|
||||
|
||||
And we can reproduce it with the following steps:
|
||||
1. register uprobe on file at zero offset
|
||||
2. mmap the file at zero offset:
|
||||
addr1 = mmap(NULL, 2 * 4096, PROT_NONE, MAP_PRIVATE, fd, 0);
|
||||
3. mremap part of vma1 to new vma2:
|
||||
addr2 = mremap(addr1, 4096, 2 * 4096, MREMAP_MAYMOVE);
|
||||
4. mremap back to orig addr1:
|
||||
mremap(addr2, 4096, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, addr1);
|
||||
|
||||
In step 3, the kernel will remap the vma1 range [addr1, addr1 + 4096] to the new vma2
|
||||
with range [addr2, addr2 + 8192], and remap uprobe anon page from the vma1
|
||||
to vma2, then unmap the vma1 range [addr1, addr1 + 4096].
|
||||
|
||||
In step 4, the vma2 range [addr2, addr2 + 4096] will be remapped back to the
|
||||
addr range [addr1, addr1 + 4096]. Since the addr range [addr1 + 4096,
|
||||
addr1 + 8192] still maps the file, it will take vma_merge_new_range to
|
||||
expand the range, and then do uprobe_mmap in vma_complete. Since the
|
||||
merged vma pgoff is also zero offset, it will install uprobe anon page to
|
||||
the merged vma. However, the upcoming move_page_tables step, which uses
|
||||
set_pte_at to remap the vma2 uprobe pte to the merged vma, will overwrite
|
||||
the newly installed uprobe pte in the merged vma, leaving that pte orphaned.
|
||||
|
||||
Since the uprobe pte will be remapped to the merged vma, we can remove the
|
||||
unnecessary uprobe_mmap upon the merged vma.
|
||||
|
||||
This problem was first found in linux-6.6.y and also exists in the
|
||||
community syzkaller:
|
||||
https://lore.kernel.org/all/000000000000ada39605a5e71711@google.com/T/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250529155650.4017699-1-pulehui@huaweicloud.com
|
||||
Link: https://lkml.kernel.org/r/20250529155650.4017699-2-pulehui@huaweicloud.com
|
||||
Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints")
|
||||
Signed-off-by: Pu Lehui <pulehui@huawei.com>
|
||||
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Cc: Jann Horn <jannh@google.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
|
||||
Cc: Oleg Nesterov <oleg@redhat.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vma.c | 20 +++++++++++++++++---
|
||||
mm/vma.h | 7 +++++++
|
||||
2 files changed, 24 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -144,6 +144,9 @@ static void init_multi_vma_prep(struct v
|
||||
vp->file = vma->vm_file;
|
||||
if (vp->file)
|
||||
vp->mapping = vma->vm_file->f_mapping;
|
||||
+
|
||||
+ if (vmg && vmg->skip_vma_uprobe)
|
||||
+ vp->skip_vma_uprobe = true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -333,10 +336,13 @@ static void vma_complete(struct vma_prep
|
||||
|
||||
if (vp->file) {
|
||||
i_mmap_unlock_write(vp->mapping);
|
||||
- uprobe_mmap(vp->vma);
|
||||
|
||||
- if (vp->adj_next)
|
||||
- uprobe_mmap(vp->adj_next);
|
||||
+ if (!vp->skip_vma_uprobe) {
|
||||
+ uprobe_mmap(vp->vma);
|
||||
+
|
||||
+ if (vp->adj_next)
|
||||
+ uprobe_mmap(vp->adj_next);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (vp->remove) {
|
||||
@@ -1783,6 +1789,14 @@ struct vm_area_struct *copy_vma(struct v
|
||||
faulted_in_anon_vma = false;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * If the VMA we are copying might contain a uprobe PTE, ensure
|
||||
+ * that we do not establish one upon merge. Otherwise, when mremap()
|
||||
+ * moves page tables, it will orphan the newly created PTE.
|
||||
+ */
|
||||
+ if (vma->vm_file)
|
||||
+ vmg.skip_vma_uprobe = true;
|
||||
+
|
||||
new_vma = find_vma_prev(mm, addr, &vmg.prev);
|
||||
if (new_vma && new_vma->vm_start < addr + len)
|
||||
return NULL; /* should never get here */
|
||||
--- a/mm/vma.h
|
||||
+++ b/mm/vma.h
|
||||
@@ -19,6 +19,8 @@ struct vma_prepare {
|
||||
struct vm_area_struct *insert;
|
||||
struct vm_area_struct *remove;
|
||||
struct vm_area_struct *remove2;
|
||||
+
|
||||
+ bool skip_vma_uprobe :1;
|
||||
};
|
||||
|
||||
struct unlink_vma_file_batch {
|
||||
@@ -120,6 +122,11 @@ struct vma_merge_struct {
|
||||
*/
|
||||
bool give_up_on_oom :1;
|
||||
|
||||
+ /*
|
||||
+ * If set, skip uprobe_mmap upon merged vma.
|
||||
+ */
|
||||
+ bool skip_vma_uprobe :1;
|
||||
+
|
||||
/* Internal flags set during merge process: */
|
||||
|
||||
/*
|
217
debian/patches/patchset-pf/fixes/0019-mm-hugetlb-unshare-page-tables-during-VMA-split-not-.patch
vendored
Normal file
@@ -0,0 +1,217 @@
|
||||
From 6f1e03b94f7777323aaefd9286d992a1cbd0adf7 Mon Sep 17 00:00:00 2001
|
||||
From: Jann Horn <jannh@google.com>
|
||||
Date: Tue, 27 May 2025 23:23:53 +0200
|
||||
Subject: mm/hugetlb: unshare page tables during VMA split, not before
|
||||
|
||||
Currently, __split_vma() triggers hugetlb page table unsharing through
|
||||
vm_ops->may_split(). This happens before the VMA lock and rmap locks are
|
||||
taken - which is too early, it allows racing VMA-locked page faults in our
|
||||
process and racing rmap walks from other processes to cause page tables to
|
||||
be shared again before we actually perform the split.
|
||||
|
||||
Fix it by explicitly calling into the hugetlb unshare logic from
|
||||
__split_vma() in the same place where THP splitting also happens. At that
|
||||
point, both the VMA and the rmap(s) are write-locked.
|
||||
|
||||
An annoying detail is that we can now call into the helper
|
||||
hugetlb_unshare_pmds() from two different locking contexts:
|
||||
|
||||
1. from hugetlb_split(), holding:
|
||||
- mmap lock (exclusively)
|
||||
- VMA lock
|
||||
- file rmap lock (exclusively)
|
||||
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
|
||||
call us with only the mmap lock held (in shared mode), but currently
|
||||
only runs while holding mmap lock (exclusively) and VMA lock
|
||||
|
||||
Backporting note:
|
||||
This commit fixes a racy protection that was introduced in commit
|
||||
b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that
|
||||
commit claimed to fix an issue introduced in 5.13, but it should actually
|
||||
also go all the way back.
|
||||
|
||||
[jannh@google.com: v2]
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com
|
||||
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
|
||||
Signed-off-by: Jann Horn <jannh@google.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reviewed-by: Oscar Salvador <osalvador@suse.de>
|
||||
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/hugetlb.h | 3 ++
|
||||
mm/hugetlb.c | 60 +++++++++++++++++++++++---------
|
||||
mm/vma.c | 7 ++++
|
||||
tools/testing/vma/vma_internal.h | 2 ++
|
||||
4 files changed, 56 insertions(+), 16 deletions(-)
|
||||
|
||||
--- a/include/linux/hugetlb.h
|
||||
+++ b/include/linux/hugetlb.h
|
||||
@@ -276,6 +276,7 @@ bool is_hugetlb_entry_migration(pte_t pt
|
||||
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
|
||||
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
|
||||
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
|
||||
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
#else /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
@@ -473,6 +474,8 @@ static inline void fixup_hugetlb_reserva
|
||||
{
|
||||
}
|
||||
|
||||
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
|
||||
+
|
||||
#endif /* !CONFIG_HUGETLB_PAGE */
|
||||
|
||||
#ifndef pgd_write
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -120,7 +120,7 @@ static void hugetlb_vma_lock_free(struct
|
||||
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
|
||||
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
|
||||
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
|
||||
- unsigned long start, unsigned long end);
|
||||
+ unsigned long start, unsigned long end, bool take_locks);
|
||||
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
|
||||
|
||||
static void hugetlb_free_folio(struct folio *folio)
|
||||
@@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm
|
||||
{
|
||||
if (addr & ~(huge_page_mask(hstate_vma(vma))))
|
||||
return -EINVAL;
|
||||
+ return 0;
|
||||
+}
|
||||
|
||||
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
|
||||
+{
|
||||
/*
|
||||
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
|
||||
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
|
||||
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
|
||||
+ * This function is called in the middle of a VMA split operation, with
|
||||
+ * MM, VMA and rmap all write-locked to prevent concurrent page table
|
||||
+ * walks (except hardware and gup_fast()).
|
||||
*/
|
||||
+ vma_assert_write_locked(vma);
|
||||
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
|
||||
+
|
||||
if (addr & ~PUD_MASK) {
|
||||
- /*
|
||||
- * hugetlb_vm_op_split is called right before we attempt to
|
||||
- * split the VMA. We will need to unshare PMDs in the old and
|
||||
- * new VMAs, so let's unshare before we split.
|
||||
- */
|
||||
unsigned long floor = addr & PUD_MASK;
|
||||
unsigned long ceil = floor + PUD_SIZE;
|
||||
|
||||
- if (floor >= vma->vm_start && ceil <= vma->vm_end)
|
||||
- hugetlb_unshare_pmds(vma, floor, ceil);
|
||||
+ if (floor >= vma->vm_start && ceil <= vma->vm_end) {
|
||||
+ /*
|
||||
+ * Locking:
|
||||
+ * Use take_locks=false here.
|
||||
+ * The file rmap lock is already held.
|
||||
+ * The hugetlb VMA lock can't be taken when we already
|
||||
+ * hold the file rmap lock, and we don't need it because
|
||||
+ * its purpose is to synchronize against concurrent page
|
||||
+ * table walks, which are not possible thanks to the
|
||||
+ * locks held by our caller.
|
||||
+ */
|
||||
+ hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
|
||||
+ }
|
||||
}
|
||||
-
|
||||
- return 0;
|
||||
}
|
||||
|
||||
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
|
||||
@@ -7884,9 +7898,16 @@ void move_hugetlb_state(struct folio *ol
|
||||
spin_unlock_irq(&hugetlb_lock);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * If @take_locks is false, the caller must ensure that no concurrent page table
|
||||
+ * access can happen (except for gup_fast() and hardware page walks).
|
||||
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
|
||||
+ * concurrent page fault handling) and the file rmap lock.
|
||||
+ */
|
||||
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
- unsigned long end)
|
||||
+ unsigned long end,
|
||||
+ bool take_locks)
|
||||
{
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
unsigned long sz = huge_page_size(h);
|
||||
@@ -7910,8 +7931,12 @@ static void hugetlb_unshare_pmds(struct
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
|
||||
start, end);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
- hugetlb_vma_lock_write(vma);
|
||||
- i_mmap_lock_write(vma->vm_file->f_mapping);
|
||||
+ if (take_locks) {
|
||||
+ hugetlb_vma_lock_write(vma);
|
||||
+ i_mmap_lock_write(vma->vm_file->f_mapping);
|
||||
+ } else {
|
||||
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
|
||||
+ }
|
||||
for (address = start; address < end; address += PUD_SIZE) {
|
||||
ptep = hugetlb_walk(vma, address, sz);
|
||||
if (!ptep)
|
||||
@@ -7921,8 +7946,10 @@ static void hugetlb_unshare_pmds(struct
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
flush_hugetlb_tlb_range(vma, start, end);
|
||||
- i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
- hugetlb_vma_unlock_write(vma);
|
||||
+ if (take_locks) {
|
||||
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||
+ hugetlb_vma_unlock_write(vma);
|
||||
+ }
|
||||
/*
|
||||
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
|
||||
* Documentation/mm/mmu_notifier.rst.
|
||||
@@ -7937,7 +7964,8 @@ static void hugetlb_unshare_pmds(struct
|
||||
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
|
||||
{
|
||||
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
|
||||
- ALIGN_DOWN(vma->vm_end, PUD_SIZE));
|
||||
+ ALIGN_DOWN(vma->vm_end, PUD_SIZE),
|
||||
+ /* take_locks = */ true);
|
||||
}
|
||||
|
||||
/*
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -516,7 +516,14 @@ __split_vma(struct vma_iterator *vmi, st
|
||||
init_vma_prep(&vp, vma);
|
||||
vp.insert = new;
|
||||
vma_prepare(&vp);
|
||||
+
|
||||
+ /*
|
||||
+ * Get rid of huge pages and shared page tables straddling the split
|
||||
+ * boundary.
|
||||
+ */
|
||||
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
|
||||
+ if (is_vm_hugetlb_page(vma))
|
||||
+ hugetlb_split(vma, addr);
|
||||
|
||||
if (new_below) {
|
||||
vma->vm_start = addr;
|
||||
--- a/tools/testing/vma/vma_internal.h
|
||||
+++ b/tools/testing/vma/vma_internal.h
|
||||
@@ -793,6 +793,8 @@ static inline void vma_adjust_trans_huge
|
||||
(void)next;
|
||||
}
|
||||
|
||||
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
|
||||
+
|
||||
static inline void vma_iter_free(struct vma_iterator *vmi)
|
||||
{
|
||||
mas_destroy(&vmi->mas);
|
50
debian/patches/patchset-pf/fixes/0020-mm-hugetlb-fix-huge_pmd_unshare-vs-GUP-fast-race.patch
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
From cbd0e47470ea4db11acf3612edf91b5047a90d24 Mon Sep 17 00:00:00 2001
|
||||
From: Jann Horn <jannh@google.com>
|
||||
Date: Tue, 27 May 2025 23:23:54 +0200
|
||||
Subject: mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race
|
||||
|
||||
huge_pmd_unshare() drops a reference on a page table that may have
|
||||
previously been shared across processes, potentially turning it into a
|
||||
normal page table used in another process in which unrelated VMAs can
|
||||
afterwards be installed.
|
||||
|
||||
If this happens in the middle of a concurrent gup_fast(), gup_fast() could
|
||||
end up walking the page tables of another process. While I don't see any
|
||||
way in which that immediately leads to kernel memory corruption, it is
|
||||
really weird and unexpected.
|
||||
|
||||
Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(),
|
||||
just like we do in khugepaged when removing page tables for a THP
|
||||
collapse.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com
|
||||
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com
|
||||
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
|
||||
Signed-off-by: Jann Horn <jannh@google.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Muchun Song <muchun.song@linux.dev>
|
||||
Cc: Oscar Salvador <osalvador@suse.de>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/hugetlb.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
--- a/mm/hugetlb.c
|
||||
+++ b/mm/hugetlb.c
|
||||
@@ -7628,6 +7628,13 @@ int huge_pmd_unshare(struct mm_struct *m
|
||||
return 0;
|
||||
|
||||
pud_clear(pud);
|
||||
+ /*
|
||||
+ * Once our caller drops the rmap lock, some other process might be
|
||||
+ * using this page table as a normal, non-hugetlb page table.
|
||||
+ * Wait for pending gup_fast() in other threads to finish before letting
|
||||
+ * that happen.
|
||||
+ */
|
||||
+ tlb_remove_table_sync_one();
|
||||
ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
|
||||
mm_dec_nr_pmds(mm);
|
||||
return 1;
|
48
debian/patches/patchset-pf/fixes/0021-mm-madvise-handle-madvise_lock-failure-during-race-u.patch
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
From cb42e10062f07934d60ce2a9bc154ea7ac0bab5a Mon Sep 17 00:00:00 2001
|
||||
From: SeongJae Park <sj@kernel.org>
|
||||
Date: Mon, 2 Jun 2025 10:49:26 -0700
|
||||
Subject: mm/madvise: handle madvise_lock() failure during race unwinding
|
||||
|
||||
When unwinding race on -ERESTARTNOINTR handling of process_madvise(),
|
||||
madvise_lock() failure is ignored. Check the failure and abort the remaining
|
||||
work in that case.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org
|
||||
Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()")
|
||||
Signed-off-by: SeongJae Park <sj@kernel.org>
|
||||
Reported-by: Barry Song <21cnbao@gmail.com>
|
||||
Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com
|
||||
Reviewed-by: Jann Horn <jannh@google.com>
|
||||
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
|
||||
Reviewed-by: Barry Song <baohua@kernel.org>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/madvise.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/mm/madvise.c
|
||||
+++ b/mm/madvise.c
|
||||
@@ -1830,7 +1830,9 @@ static ssize_t vector_madvise(struct mm_
|
||||
|
||||
/* Drop and reacquire lock to unwind race. */
|
||||
madvise_unlock(mm, behavior);
|
||||
- madvise_lock(mm, behavior);
|
||||
+ ret = madvise_lock(mm, behavior);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
continue;
|
||||
}
|
||||
if (ret < 0)
|
||||
@@ -1839,6 +1841,7 @@ static ssize_t vector_madvise(struct mm_
|
||||
}
|
||||
madvise_unlock(mm, behavior);
|
||||
|
||||
+out:
|
||||
ret = (total_len - iov_iter_count(iter)) ? : ret;
|
||||
|
||||
return ret;
|
164
debian/patches/patchset-pf/fixes/0022-video-screen_info-Relocate-framebuffers-behind-PCI-b.patch
vendored
Normal file
@@ -0,0 +1,164 @@
|
||||
From 0aeb6f83ff11709bb4b6fc9afa2f742681ca36e1 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Date: Wed, 28 May 2025 10:02:08 +0200
|
||||
Subject: video: screen_info: Relocate framebuffers behind PCI bridges
|
||||
|
||||
Apply PCI host-bridge window offsets to screen_info framebuffers. Fixes
|
||||
invalid access to I/O memory.
|
||||
|
||||
Resources behind a PCI host bridge can be relocated by a certain offset
|
||||
in the kernel's CPU address range used for I/O. The framebuffer memory
|
||||
range stored in screen_info refers to the CPU addresses as seen during
|
||||
boot (where the offset is 0). During boot up, firmware may assign a
|
||||
different memory offset to the PCI host bridge, thereby relocating
|
||||
the framebuffer address of the PCI graphics device as seen by the kernel.
|
||||
The information in screen_info must be updated as well.
|
||||
|
||||
The helper pcibios_bus_to_resource() performs the relocation of the
|
||||
screen_info's framebuffer resource (given in PCI bus addresses). The
|
||||
result matches the I/O-memory resource of the PCI graphics device (given
|
||||
in CPU addresses). As before, we store away the information necessary to
|
||||
later update the information in screen_info itself.
|
||||
|
||||
Commit 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated
|
||||
EFI framebuffers") added the code for updating screen_info. It is based
|
||||
on similar functionality that pre-existed in efifb. Efifb uses a pointer
|
||||
to the PCI resource, while the newer code does a memcpy of the region.
|
||||
Hence efifb sees any updates to the PCI resource and avoids the issue.
|
||||
|
||||
v3:
|
||||
- Only use struct pci_bus_region for PCI bus addresses (Bjorn)
|
||||
- Clarify address semantics in commit messages and comments (Bjorn)
|
||||
v2:
|
||||
- Fixed tags (Takashi, Ivan)
|
||||
- Updated information on efifb
|
||||
|
||||
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Reported-by: "Ivan T. Ivanov" <iivanov@suse.de>
|
||||
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1240696
|
||||
Tested-by: "Ivan T. Ivanov" <iivanov@suse.de>
|
||||
Fixes: 78aa89d1dfba ("firmware/sysfb: Update screen_info for relocated EFI framebuffers")
|
||||
Cc: dri-devel@lists.freedesktop.org
|
||||
Cc: <stable@vger.kernel.org> # v6.9+
|
||||
Link: https://lore.kernel.org/r/20250528080234.7380-1-tzimmermann@suse.de
|
||||
---
|
||||
drivers/video/screen_info_pci.c | 79 +++++++++++++++++++++------------
|
||||
1 file changed, 50 insertions(+), 29 deletions(-)
|
||||
|
||||
--- a/drivers/video/screen_info_pci.c
|
||||
+++ b/drivers/video/screen_info_pci.c
|
||||
@@ -7,8 +7,8 @@
|
||||
|
||||
static struct pci_dev *screen_info_lfb_pdev;
|
||||
static size_t screen_info_lfb_bar;
|
||||
-static resource_size_t screen_info_lfb_offset;
|
||||
-static struct resource screen_info_lfb_res = DEFINE_RES_MEM(0, 0);
|
||||
+static resource_size_t screen_info_lfb_res_start; // original start of resource
|
||||
+static resource_size_t screen_info_lfb_offset; // framebuffer offset within resource
|
||||
|
||||
static bool __screen_info_relocation_is_valid(const struct screen_info *si, struct resource *pr)
|
||||
{
|
||||
@@ -31,7 +31,7 @@ void screen_info_apply_fixups(void)
|
||||
if (screen_info_lfb_pdev) {
|
||||
struct resource *pr = &screen_info_lfb_pdev->resource[screen_info_lfb_bar];
|
||||
|
||||
- if (pr->start != screen_info_lfb_res.start) {
|
||||
+ if (pr->start != screen_info_lfb_res_start) {
|
||||
if (__screen_info_relocation_is_valid(si, pr)) {
|
||||
/*
|
||||
* Only update base if we have an actual
|
||||
@@ -47,46 +47,67 @@ void screen_info_apply_fixups(void)
|
||||
}
|
||||
}
|
||||
|
||||
+static int __screen_info_lfb_pci_bus_region(const struct screen_info *si, unsigned int type,
|
||||
+ struct pci_bus_region *r)
|
||||
+{
|
||||
+ u64 base, size;
|
||||
+
|
||||
+ base = __screen_info_lfb_base(si);
|
||||
+ if (!base)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ size = __screen_info_lfb_size(si, type);
|
||||
+ if (!size)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ r->start = base;
|
||||
+ r->end = base + size - 1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static void screen_info_fixup_lfb(struct pci_dev *pdev)
|
||||
{
|
||||
unsigned int type;
|
||||
- struct resource res[SCREEN_INFO_MAX_RESOURCES];
|
||||
- size_t i, numres;
|
||||
+ struct pci_bus_region bus_region;
|
||||
int ret;
|
||||
+ struct resource r = {
|
||||
+ .flags = IORESOURCE_MEM,
|
||||
+ };
|
||||
+ const struct resource *pr;
|
||||
const struct screen_info *si = &screen_info;
|
||||
|
||||
if (screen_info_lfb_pdev)
|
||||
return; // already found
|
||||
|
||||
type = screen_info_video_type(si);
|
||||
- if (type != VIDEO_TYPE_EFI)
|
||||
- return; // only applies to EFI
|
||||
+ if (!__screen_info_has_lfb(type))
|
||||
+ return; // only applies to EFI; maybe VESA
|
||||
|
||||
- ret = screen_info_resources(si, res, ARRAY_SIZE(res));
|
||||
+ ret = __screen_info_lfb_pci_bus_region(si, type, &bus_region);
|
||||
if (ret < 0)
|
||||
return;
|
||||
- numres = ret;
|
||||
|
||||
- for (i = 0; i < numres; ++i) {
|
||||
- struct resource *r = &res[i];
|
||||
- const struct resource *pr;
|
||||
-
|
||||
- if (!(r->flags & IORESOURCE_MEM))
|
||||
- continue;
|
||||
- pr = pci_find_resource(pdev, r);
|
||||
- if (!pr)
|
||||
- continue;
|
||||
-
|
||||
- /*
|
||||
- * We've found a PCI device with the framebuffer
|
||||
- * resource. Store away the parameters to track
|
||||
- * relocation of the framebuffer aperture.
|
||||
- */
|
||||
- screen_info_lfb_pdev = pdev;
|
||||
- screen_info_lfb_bar = pr - pdev->resource;
|
||||
- screen_info_lfb_offset = r->start - pr->start;
|
||||
- memcpy(&screen_info_lfb_res, r, sizeof(screen_info_lfb_res));
|
||||
- }
|
||||
+ /*
|
||||
+ * Translate the PCI bus address to resource. Account
|
||||
+ * for an offset if the framebuffer is behind a PCI host
|
||||
+ * bridge.
|
||||
+ */
|
||||
+ pcibios_bus_to_resource(pdev->bus, &r, &bus_region);
|
||||
+
|
||||
+ pr = pci_find_resource(pdev, &r);
|
||||
+ if (!pr)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * We've found a PCI device with the framebuffer
|
||||
+ * resource. Store away the parameters to track
|
||||
+ * relocation of the framebuffer aperture.
|
||||
+ */
|
||||
+ screen_info_lfb_pdev = pdev;
|
||||
+ screen_info_lfb_bar = pr - pdev->resource;
|
||||
+ screen_info_lfb_offset = r.start - pr->start;
|
||||
+ screen_info_lfb_res_start = bus_region.start;
|
||||
}
|
||||
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY, 16,
|
||||
screen_info_fixup_lfb);
|
86
debian/patches/patchset-pf/fixes/0023-sysfb-Fix-screen_info-type-check-for-VGA.patch
vendored
Normal file
@@ -0,0 +1,86 @@
|
||||
From 06ff725d11ea8713876187973c834fb595cb26f1 Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Date: Tue, 3 Jun 2025 17:48:20 +0200
|
||||
Subject: sysfb: Fix screen_info type check for VGA
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Use the helper screen_info_video_type() to get the framebuffer
|
||||
type from struct screen_info. Handle the supported values in a sorted
|
||||
switch statement.
|
||||
|
||||
Reading orig_video_isVGA is unreliable. On most systems it is a
|
||||
VIDEO_TYPE_ constant. On some systems with VGA it is simply set
|
||||
to 1 to signal the presence of a VGA output. See vga_probe() for
|
||||
an example. Retrieving the screen_info type with the helper
|
||||
screen_info_video_type() detects these cases and returns the
|
||||
appropriate VIDEO_TYPE_ constant. For VGA, sysfb creates a device
|
||||
named "vga-framebuffer".
|
||||
|
||||
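
Roughly, the ambiguity being handled is the following (values illustrative,
not from the patch):

    /* firmware/boot code may leave either of these in the same field */
    si->orig_video_isVGA = 1;               /* bare "VGA present" flag, see vga_probe() */
    si->orig_video_isVGA = VIDEO_TYPE_VLFB; /* ...or an actual VIDEO_TYPE_ constant */

    /* screen_info_video_type(si) resolves both cases to a VIDEO_TYPE_ value */
    type = screen_info_video_type(si);
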
The sysfb code has been taken from vga16fb, where it likely didn't
|
||||
work correctly either. With this bugfix applied, vga16fb loads for
|
||||
compatible vga-framebuffer devices.
|
||||
|
||||
Fixes: 0db5b61e0dc0 ("fbdev/vga16fb: Create EGA/VGA devices in sysfb code")
|
||||
Cc: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Cc: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Cc: Alex Deucher <alexander.deucher@amd.com>
|
||||
Cc: Tzung-Bi Shih <tzungbi@kernel.org>
|
||||
Cc: Helge Deller <deller@gmx.de>
|
||||
Cc: "Uwe Kleine-König" <u.kleine-koenig@baylibre.com>
|
||||
Cc: Zsolt Kajtar <soci@c64.rulez.org>
|
||||
Cc: <stable@vger.kernel.org> # v6.1+
|
||||
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
|
||||
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
|
||||
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Link: https://lore.kernel.org/r/20250603154838.401882-1-tzimmermann@suse.de
|
||||
---
|
||||
drivers/firmware/sysfb.c | 26 ++++++++++++++++++--------
|
||||
1 file changed, 18 insertions(+), 8 deletions(-)
|
||||
|
||||
--- a/drivers/firmware/sysfb.c
|
||||
+++ b/drivers/firmware/sysfb.c
|
||||
@@ -143,6 +143,7 @@ static __init int sysfb_init(void)
|
||||
{
|
||||
struct screen_info *si = &screen_info;
|
||||
struct device *parent;
|
||||
+ unsigned int type;
|
||||
struct simplefb_platform_data mode;
|
||||
const char *name;
|
||||
bool compatible;
|
||||
@@ -170,17 +171,26 @@ static __init int sysfb_init(void)
|
||||
goto put_device;
|
||||
}
|
||||
|
||||
+ type = screen_info_video_type(si);
|
||||
+
|
||||
/* if the FB is incompatible, create a legacy framebuffer device */
|
||||
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
|
||||
- name = "efi-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
|
||||
- name = "vesa-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_VGAC)
|
||||
- name = "vga-framebuffer";
|
||||
- else if (si->orig_video_isVGA == VIDEO_TYPE_EGAC)
|
||||
+ switch (type) {
|
||||
+ case VIDEO_TYPE_EGAC:
|
||||
name = "ega-framebuffer";
|
||||
- else
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_VGAC:
|
||||
+ name = "vga-framebuffer";
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_VLFB:
|
||||
+ name = "vesa-framebuffer";
|
||||
+ break;
|
||||
+ case VIDEO_TYPE_EFI:
|
||||
+ name = "efi-framebuffer";
|
||||
+ break;
|
||||
+ default:
|
||||
name = "platform-framebuffer";
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
pd = platform_device_alloc(name, 0);
|
||||
if (!pd) {
|
113
debian/patches/patchset-pf/fixes/0024-x86-iopl-Cure-TIF_IO_BITMAP-inconsistencies.patch
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
From ba4c83076943b477c90015581cc88e262a7d772f Mon Sep 17 00:00:00 2001
|
||||
From: Thomas Gleixner <tglx@linutronix.de>
|
||||
Date: Wed, 26 Feb 2025 16:01:57 +0100
|
||||
Subject: x86/iopl: Cure TIF_IO_BITMAP inconsistencies
|
||||
|
||||
io_bitmap_exit() is invoked from exit_thread() when a task exits or
when a fork fails. In the latter case exit_thread() cleans up
|
||||
resources which were allocated during fork().
|
||||
|
||||
io_bitmap_exit() invokes task_update_io_bitmap(), which in turn ends up
|
||||
in tss_update_io_bitmap(). tss_update_io_bitmap() operates on the
|
||||
current task. If current has TIF_IO_BITMAP set, but no bitmap installed,
|
||||
tss_update_io_bitmap() crashes with a NULL pointer dereference.
|
||||
|
||||
There are two issues, which lead to that problem:
|
||||
|
||||
1) io_bitmap_exit() should not invoke task_update_io_bitmap() when
|
||||
the task, which is cleaned up, is not the current task. That's a
|
||||
clear indicator for a cleanup after a failed fork().
|
||||
|
||||
2) A task should not have TIF_IO_BITMAP set while it has neither a bitmap
   installed nor IOPL emulation level 3 activated.
|
||||
|
||||
This happens when a kernel thread is created in the context of
|
||||
a user space thread, which has TIF_IO_BITMAP set as the thread
|
||||
flags are copied and the IO bitmap pointer is cleared.
|
||||
|
||||
Other than in the failed fork() case this has no impact because
|
||||
kernel threads including IO workers never return to user space and
|
||||
therefore never invoke tss_update_io_bitmap().
|
||||
|
||||
Cure this by adding the missing cleanups and checks:
|
||||
|
||||
1) Prevent io_bitmap_exit() from invoking task_update_io_bitmap() if
   the task being cleaned up is not the current task.
|
||||
|
||||
2) Clear TIF_IO_BITMAP in copy_thread() unconditionally. For user
|
||||
space forks it is set later, when the IO bitmap is inherited in
|
||||
io_bitmap_share().
|
||||
|
||||
For paranoia's sake, add a warning to tss_update_io_bitmap() to catch
|
||||
the case, when that code is invoked with inconsistent state.
|
||||
|
||||
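
For reference, the usual way a user-space task ends up with TIF_IO_BITMAP set
is an ioperm() or iopl() call; a minimal illustrative example (not part of the
patch):

    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
            /* request access to 8 ports at 0x378; on success the kernel allocates
             * the task's I/O bitmap and sets TIF_IO_BITMAP (needs CAP_SYS_RAWIO) */
            if (ioperm(0x378, 8, 1)) {
                    perror("ioperm");
                    return 1;
            }
            return 0;
    }
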
Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped")
|
||||
Reported-by: syzbot+e2b1803445d236442e54@syzkaller.appspotmail.com
|
||||
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/87wmdceom2.ffs@tglx
|
||||
---
|
||||
arch/x86/kernel/ioport.c | 13 +++++++++----
|
||||
arch/x86/kernel/process.c | 6 ++++++
|
||||
2 files changed, 15 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/ioport.c
|
||||
+++ b/arch/x86/kernel/ioport.c
|
||||
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct
|
||||
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
|
||||
}
|
||||
|
||||
-static void task_update_io_bitmap(struct task_struct *tsk)
|
||||
+static void task_update_io_bitmap(void)
|
||||
{
|
||||
+ struct task_struct *tsk = current;
|
||||
struct thread_struct *t = &tsk->thread;
|
||||
|
||||
if (t->iopl_emul == 3 || t->io_bitmap) {
|
||||
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *
|
||||
struct io_bitmap *iobm = tsk->thread.io_bitmap;
|
||||
|
||||
tsk->thread.io_bitmap = NULL;
|
||||
- task_update_io_bitmap(tsk);
|
||||
+ /*
|
||||
+ * Don't touch the TSS when invoked on a failed fork(). TSS
|
||||
+ * reflects the state of @current and not the state of @tsk.
|
||||
+ */
|
||||
+ if (tsk == current)
|
||||
+ task_update_io_bitmap();
|
||||
if (iobm && refcount_dec_and_test(&iobm->refcnt))
|
||||
kfree(iobm);
|
||||
}
|
||||
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, leve
|
||||
}
|
||||
|
||||
t->iopl_emul = level;
|
||||
- task_update_io_bitmap(current);
|
||||
-
|
||||
+ task_update_io_bitmap();
|
||||
return 0;
|
||||
}
|
||||
|
||||
--- a/arch/x86/kernel/process.c
|
||||
+++ b/arch/x86/kernel/process.c
|
||||
@@ -181,6 +181,7 @@ int copy_thread(struct task_struct *p, c
|
||||
frame->ret_addr = (unsigned long) ret_from_fork_asm;
|
||||
p->thread.sp = (unsigned long) fork_frame;
|
||||
p->thread.io_bitmap = NULL;
|
||||
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
|
||||
p->thread.iopl_warn = 0;
|
||||
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
|
||||
|
||||
@@ -469,6 +470,11 @@ void native_tss_update_io_bitmap(void)
|
||||
} else {
|
||||
struct io_bitmap *iobm = t->io_bitmap;
|
||||
|
||||
+ if (WARN_ON_ONCE(!iobm)) {
|
||||
+ clear_thread_flag(TIF_IO_BITMAP);
|
||||
+ native_tss_invalidate_io_bitmap();
|
||||
+ }
|
||||
+
|
||||
/*
|
||||
* Only copy bitmap data when the sequence number differs. The
|
||||
* update time is accounted to the incoming task.
|
200
debian/patches/patchset-pf/fixes/0025-watchdog-fix-watchdog-may-detect-false-positive-of-s.patch
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
From 7856e6900a09ed537366a5e0c774be8926ee022e Mon Sep 17 00:00:00 2001
|
||||
From: Luo Gengkun <luogengkun@huaweicloud.com>
|
||||
Date: Mon, 21 Apr 2025 03:50:21 +0000
|
||||
Subject: watchdog: fix watchdog may detect false positive of softlockup
|
||||
|
||||
When updating `watchdog_thresh`, there is a race condition between writing
|
||||
the new `watchdog_thresh` value and stopping the old watchdog timer. If
|
||||
the old timer triggers during this window, it may falsely detect a
|
||||
softlockup due to the old interval and the new `watchdog_thresh` value
|
||||
being used. The problem can be described as follow:
|
||||
|
||||
# We assume the previous watchdog_thresh is 60, so the watchdog timer is
|
||||
# coming every 24s.
|
||||
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
 |
 +------>+ update watchdog_thresh (We are in kernel now)
          |
          | # using old interval and new `watchdog_thresh`
          +------>+ watchdog hrtimer (irq context: detect softlockup)
                   |
                   |
                   +-------+
                           |
                           |
                           + softlockup_stop_all
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
|
||||
The update to the actual `watchdog_thresh` is delayed until after the old
|
||||
timer is stopped, preventing false positives.
|
||||
|
||||
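
A bare-bones sketch of the shadow-variable ordering (simplified names;
stop_all_timers()/start_all_timers() stand in for the real softlockup
stop/start paths, this is not the exact kernel code):

    static int thresh_next;        /* written by the sysctl handler */
    static int thresh;             /* consumed by the running watchdog timers */

    static void reconfigure(bool thresh_changed)
    {
            stop_all_timers();     /* no timer callback can run past this point */
            if (thresh_changed)
                    thresh = READ_ONCE(thresh_next);
            start_all_timers();    /* new interval and new threshold together */
    }
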
The following testcase may help to understand this problem.
|
||||
|
||||
---------------------------------------------
|
||||
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
|
||||
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
|
||||
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
|
||||
echo 60 > /proc/sys/kernel/watchdog_thresh
|
||||
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
|
||||
echo 10 > /proc/sys/kernel/watchdog_thresh &
|
||||
---------------------------------------------
|
||||
|
||||
The test case above first removes the throttling restrictions for
|
||||
real-time tasks. It then sets watchdog_thresh to 60 and executes a
|
||||
real-time task ,a simple while(1) loop, on cpu3. Consequently, the final
|
||||
command gets blocked because the presence of this real-time thread
|
||||
prevents kworker:3 from being selected by the scheduler. This eventually
|
||||
triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating
|
||||
with inconsistent variable - using both the old interval and the updated
|
||||
watchdog_thresh simultaneously.
|
||||
|
||||
[nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
|
||||
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
|
||||
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com
|
||||
Signed-off-by: Luo Gengkun <luogengkun@huaweicloud.com>
|
||||
Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
|
||||
Cc: Doug Anderson <dianders@chromium.org>
|
||||
Cc: Joel Granados <joel.granados@kernel.org>
|
||||
Cc: Song Liu <song@kernel.org>
|
||||
Cc: Thomas Gleixner <tglx@linutronix.de>
|
||||
Cc: "Nysal Jan K.A." <nysal@linux.ibm.com>
|
||||
Cc: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
|
||||
1 file changed, 27 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/kernel/watchdog.c
|
||||
+++ b/kernel/watchdog.c
|
||||
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
|
||||
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
|
||||
static int __read_mostly watchdog_softlockup_user_enabled = 1;
|
||||
int __read_mostly watchdog_thresh = 10;
|
||||
+static int __read_mostly watchdog_thresh_next;
|
||||
static int __read_mostly watchdog_hardlockup_available;
|
||||
|
||||
struct cpumask watchdog_cpumask __read_mostly;
|
||||
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static void __lockup_detector_reconfigure(void)
|
||||
+static void __lockup_detector_reconfigure(bool thresh_changed)
|
||||
{
|
||||
cpus_read_lock();
|
||||
watchdog_hardlockup_stop();
|
||||
|
||||
softlockup_stop_all();
|
||||
+ /*
|
||||
+ * To prevent watchdog_timer_fn from using the old interval and
|
||||
+ * the new watchdog_thresh at the same time, which could lead to
|
||||
+ * false softlockup reports, it is necessary to update the
|
||||
+ * watchdog_thresh after the softlockup is completed.
|
||||
+ */
|
||||
+ if (thresh_changed)
|
||||
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
|
||||
set_sample_period();
|
||||
lockup_detector_update_enable();
|
||||
if (watchdog_enabled && watchdog_thresh)
|
||||
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
|
||||
void lockup_detector_reconfigure(void)
|
||||
{
|
||||
mutex_lock(&watchdog_mutex);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
}
|
||||
|
||||
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
|
||||
return;
|
||||
|
||||
mutex_lock(&watchdog_mutex);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
softlockup_initialized = true;
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
-static void __lockup_detector_reconfigure(void)
|
||||
+static void __lockup_detector_reconfigure(bool thresh_changed)
|
||||
{
|
||||
cpus_read_lock();
|
||||
watchdog_hardlockup_stop();
|
||||
+ if (thresh_changed)
|
||||
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
|
||||
lockup_detector_update_enable();
|
||||
watchdog_hardlockup_start();
|
||||
cpus_read_unlock();
|
||||
}
|
||||
void lockup_detector_reconfigure(void)
|
||||
{
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
}
|
||||
static inline void lockup_detector_setup(void)
|
||||
{
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(false);
|
||||
}
|
||||
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
|
||||
|
||||
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
|
||||
#ifdef CONFIG_SYSCTL
|
||||
|
||||
/* Propagate any changes to the watchdog infrastructure */
|
||||
-static void proc_watchdog_update(void)
|
||||
+static void proc_watchdog_update(bool thresh_changed)
|
||||
{
|
||||
/* Remove impossible cpus to keep sysctl output clean. */
|
||||
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
|
||||
- __lockup_detector_reconfigure();
|
||||
+ __lockup_detector_reconfigure(thresh_changed);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
|
||||
} else {
|
||||
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!err && old != READ_ONCE(*param))
|
||||
- proc_watchdog_update();
|
||||
+ proc_watchdog_update(false);
|
||||
}
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
|
||||
|
||||
mutex_lock(&watchdog_mutex);
|
||||
|
||||
- old = READ_ONCE(watchdog_thresh);
|
||||
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
|
||||
+
|
||||
+ old = watchdog_thresh_next;
|
||||
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
|
||||
- if (!err && write && old != READ_ONCE(watchdog_thresh))
|
||||
- proc_watchdog_update();
|
||||
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
|
||||
+ proc_watchdog_update(true);
|
||||
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
|
||||
|
||||
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
|
||||
if (!err && write)
|
||||
- proc_watchdog_update();
|
||||
+ proc_watchdog_update(false);
|
||||
|
||||
mutex_unlock(&watchdog_mutex);
|
||||
return err;
|
||||
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
|
||||
},
|
||||
{
|
||||
.procname = "watchdog_thresh",
|
||||
- .data = &watchdog_thresh,
|
||||
+ .data = &watchdog_thresh_next,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_watchdog_thresh,
|
288
debian/patches/patchset-pf/fixes/0026-sched-rt-Fix-race-in-push_rt_task.patch
vendored
Normal file
@@ -0,0 +1,288 @@
|
||||
From 45c6602b7fa2a9dfd05a1f9289504c2437205ce4 Mon Sep 17 00:00:00 2001
|
||||
From: Harshit Agarwal <harshit@nutanix.com>
|
||||
Date: Tue, 25 Feb 2025 18:05:53 +0000
|
||||
Subject: sched/rt: Fix race in push_rt_task
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Overview
|
||||
========
|
||||
When a CPU chooses to call push_rt_task and picks a task to push to
|
||||
another CPU's runqueue, it will call the find_lock_lowest_rq() method,
which would take a double lock on both CPUs' runqueues. If one of the
locks isn't readily available, it may lead to dropping the current
|
||||
runqueue lock and reacquiring both the locks at once. During this window
|
||||
it is possible that the task is already migrated and is running on some
|
||||
other CPU. These cases are already handled. However, if the task is
|
||||
migrated and has already been executed and another CPU is now trying to
|
||||
wake it up (ttwu) such that it is queued again on the runqueue
|
||||
(on_rq is 1) and also if the task was run by the same CPU, then the
|
||||
current checks will pass even though the task was migrated out and is no
|
||||
longer in the pushable tasks list.
|
||||
|
||||
Crashes
|
||||
=======
|
||||
This bug resulted in quite a few flavors of crashes triggering kernel
|
||||
panics with various crash signatures such as assert failures, page
|
||||
faults, null pointer dereferences, and queue corruption errors all
|
||||
coming from the scheduler itself.
|
||||
|
||||
Some of the crashes:
|
||||
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? die+0x2a/0x50
|
||||
? do_trap+0x85/0x100
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? do_error_trap+0x64/0xa0
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? exc_invalid_op+0x4c/0x60
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
? asm_exc_invalid_op+0x12/0x20
|
||||
? pick_next_task_rt+0x6e/0x1d0
|
||||
__schedule+0x5cb/0x790
|
||||
? update_ts_time_stats+0x55/0x70
|
||||
schedule_idle+0x1e/0x40
|
||||
do_idle+0x15e/0x200
|
||||
cpu_startup_entry+0x19/0x20
|
||||
start_secondary+0x117/0x160
|
||||
secondary_startup_64_no_verify+0xb0/0xbb
|
||||
|
||||
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? no_context+0x183/0x350
|
||||
? __warn+0x8a/0xe0
|
||||
? exc_page_fault+0x3d6/0x520
|
||||
? asm_exc_page_fault+0x1e/0x30
|
||||
? pick_next_task_rt+0xb5/0x1d0
|
||||
? pick_next_task_rt+0x8c/0x1d0
|
||||
__schedule+0x583/0x7e0
|
||||
? update_ts_time_stats+0x55/0x70
|
||||
schedule_idle+0x1e/0x40
|
||||
do_idle+0x15e/0x200
|
||||
cpu_startup_entry+0x19/0x20
|
||||
start_secondary+0x117/0x160
|
||||
secondary_startup_64_no_verify+0xb0/0xbb
|
||||
|
||||
-> BUG: unable to handle page fault for address: ffff9464daea5900
|
||||
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
|
||||
|
||||
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? die+0x2a/0x50
|
||||
? do_trap+0x85/0x100
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? do_error_trap+0x64/0xa0
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? exc_invalid_op+0x4c/0x60
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
? asm_exc_invalid_op+0x12/0x20
|
||||
? dequeue_top_rt_rq+0xa2/0xb0
|
||||
dequeue_rt_entity+0x1f/0x70
|
||||
dequeue_task_rt+0x2d/0x70
|
||||
__schedule+0x1a8/0x7e0
|
||||
? blk_finish_plug+0x25/0x40
|
||||
schedule+0x3c/0xb0
|
||||
futex_wait_queue_me+0xb6/0x120
|
||||
futex_wait+0xd9/0x240
|
||||
do_futex+0x344/0xa90
|
||||
? get_mm_exe_file+0x30/0x60
|
||||
? audit_exe_compare+0x58/0x70
|
||||
? audit_filter_rules.constprop.26+0x65e/0x1220
|
||||
__x64_sys_futex+0x148/0x1f0
|
||||
do_syscall_64+0x30/0x80
|
||||
entry_SYSCALL_64_after_hwframe+0x62/0xc7
|
||||
|
||||
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
|
||||
Call Trace:
|
||||
? __die_body+0x1a/0x60
|
||||
? no_context+0x183/0x350
|
||||
? spurious_kernel_fault+0x171/0x1c0
|
||||
? exc_page_fault+0x3b6/0x520
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? asm_exc_page_fault+0x1e/0x30
|
||||
? _cond_resched+0x15/0x30
|
||||
? futex_wait_queue_me+0xc8/0x120
|
||||
? futex_wait+0xd9/0x240
|
||||
? try_to_wake_up+0x1b8/0x490
|
||||
? futex_wake+0x78/0x160
|
||||
? do_futex+0xcd/0xa90
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? plist_del+0x6a/0xd0
|
||||
? plist_check_list+0x15/0x40
|
||||
? plist_check_list+0x2e/0x40
|
||||
? dequeue_pushable_task+0x20/0x70
|
||||
? __schedule+0x382/0x7e0
|
||||
? asm_sysvec_reschedule_ipi+0xa/0x20
|
||||
? schedule+0x3c/0xb0
|
||||
? exit_to_user_mode_prepare+0x9e/0x150
|
||||
? irqentry_exit_to_user_mode+0x5/0x30
|
||||
? asm_sysvec_reschedule_ipi+0x12/0x20
|
||||
|
||||
Above are some of the common examples of the crashes that were observed
|
||||
due to this issue.
|
||||
|
||||
Details
|
||||
=======
|
||||
Let's look at the following scenario to understand this race.
|
||||
|
||||
1) CPU A enters push_rt_task
|
||||
a) CPU A has chosen next_task = task p.
|
||||
b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq).
|
||||
c) CPU A identifies CPU X as a destination CPU (X < Z).
|
||||
d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq).
|
||||
e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has
|
||||
locked CPU X’s rq, and thus, CPU A must wait.
|
||||
|
||||
2) At CPU Z
|
||||
a) Previous task has completed execution and thus, CPU Z enters
|
||||
schedule, locks its own rq after CPU A releases it.
|
||||
b) CPU Z dequeues previous task and begins executing task p.
|
||||
c) CPU Z unlocks its rq.
|
||||
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
|
||||
lock) which triggers the schedule function on CPU Z.
|
||||
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
|
||||
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
|
||||
|
||||
3) At CPU B
|
||||
a) CPU B enters try_to_wake_up with input task p.
|
||||
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
|
||||
B.state = WAKING.
|
||||
c) CPU B via select_task_rq determines CPU Y as the target CPU.
|
||||
|
||||
4) The race
|
||||
a) CPU A acquires CPU X’s lock and relocks CPU Z.
|
||||
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
|
||||
still on CPU Z.
|
||||
c) CPU A failed to notice task p had been dequeued from CPU Z while
|
||||
CPU A was waiting for locks in double_lock_balance. If CPU A knew
|
||||
that task p had been dequeued, it would return NULL forcing
|
||||
push_rt_task to give up task p's migration.
|
||||
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
|
||||
e) CPU B locks Y's rq. CPU B enqueues task p onto Y and sets task
|
||||
p.on_rq = 1.
|
||||
f) CPU B unlocks CPU Y, triggering memory synchronization.
|
||||
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
|
||||
has not migrated.
|
||||
h) CPU A decides to migrate p to CPU X.
|
||||
|
||||
This leads to A dequeuing p from Y's queue and various crashes down the
|
||||
line.
|
||||
|
||||
Solution
|
||||
========
|
||||
The solution here is fairly simple. After obtaining the lock (at 4a),
|
||||
the check is enhanced to make sure that the task is still at the head of
|
||||
the pushable tasks list. If not, then it is anyway not suitable for
|
||||
being pushed out.
|
||||
|
||||
Testing
|
||||
=======
|
||||
The fix is tested on a cluster of 3 nodes, where the panics due to this
|
||||
are hit every couple of days. A fix similar to this was deployed on such a
|
||||
cluster and was stable for more than 30 days.
|
||||
|
||||
Co-developed-by: Jon Kohler <jon@nutanix.com>
|
||||
Signed-off-by: Jon Kohler <jon@nutanix.com>
|
||||
Co-developed-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
|
||||
Signed-off-by: Gauri Patwardhan <gauri.patwardhan@nutanix.com>
|
||||
Co-developed-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
|
||||
Signed-off-by: Rahul Chunduru <rahul.chunduru@nutanix.com>
|
||||
Signed-off-by: Harshit Agarwal <harshit@nutanix.com>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
|
||||
Reviewed-by: Phil Auld <pauld@redhat.com>
|
||||
Tested-by: Will Ton <william.ton@nutanix.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
|
||||
---
|
||||
kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------
|
||||
1 file changed, 26 insertions(+), 28 deletions(-)
|
||||
|
||||
--- a/kernel/sched/rt.c
|
||||
+++ b/kernel/sched/rt.c
|
||||
@@ -1883,6 +1883,27 @@ static int find_lowest_rq(struct task_st
|
||||
return -1;
|
||||
}
|
||||
|
||||
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
+{
|
||||
+ struct task_struct *p;
|
||||
+
|
||||
+ if (!has_pushable_tasks(rq))
|
||||
+ return NULL;
|
||||
+
|
||||
+ p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
+ struct task_struct, pushable_tasks);
|
||||
+
|
||||
+ BUG_ON(rq->cpu != task_cpu(p));
|
||||
+ BUG_ON(task_current(rq, p));
|
||||
+ BUG_ON(task_current_donor(rq, p));
|
||||
+ BUG_ON(p->nr_cpus_allowed <= 1);
|
||||
+
|
||||
+ BUG_ON(!task_on_rq_queued(p));
|
||||
+ BUG_ON(!rt_task(p));
|
||||
+
|
||||
+ return p;
|
||||
+}
|
||||
+
|
||||
/* Will lock the rq it finds */
|
||||
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
|
||||
{
|
||||
@@ -1913,18 +1934,16 @@ static struct rq *find_lock_lowest_rq(st
|
||||
/*
|
||||
* We had to unlock the run queue. In
|
||||
* the mean time, task could have
|
||||
- * migrated already or had its affinity changed.
|
||||
- * Also make sure that it wasn't scheduled on its rq.
|
||||
+ * migrated already or had its affinity changed,
|
||||
+ * therefore check if the task is still at the
|
||||
+ * head of the pushable tasks list.
|
||||
* It is possible the task was scheduled, set
|
||||
* "migrate_disabled" and then got preempted, so we must
|
||||
* check the task migration disable flag here too.
|
||||
*/
|
||||
- if (unlikely(task_rq(task) != rq ||
|
||||
+ if (unlikely(is_migration_disabled(task) ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
|
||||
- task_on_cpu(rq, task) ||
|
||||
- !rt_task(task) ||
|
||||
- is_migration_disabled(task) ||
|
||||
- !task_on_rq_queued(task))) {
|
||||
+ task != pick_next_pushable_task(rq))) {
|
||||
|
||||
double_unlock_balance(rq, lowest_rq);
|
||||
lowest_rq = NULL;
|
||||
@@ -1944,27 +1963,6 @@ static struct rq *find_lock_lowest_rq(st
|
||||
return lowest_rq;
|
||||
}
|
||||
|
||||
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
-{
|
||||
- struct task_struct *p;
|
||||
-
|
||||
- if (!has_pushable_tasks(rq))
|
||||
- return NULL;
|
||||
-
|
||||
- p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
- struct task_struct, pushable_tasks);
|
||||
-
|
||||
- BUG_ON(rq->cpu != task_cpu(p));
|
||||
- BUG_ON(task_current(rq, p));
|
||||
- BUG_ON(task_current_donor(rq, p));
|
||||
- BUG_ON(p->nr_cpus_allowed <= 1);
|
||||
-
|
||||
- BUG_ON(!task_on_rq_queued(p));
|
||||
- BUG_ON(!rt_task(p));
|
||||
-
|
||||
- return p;
|
||||
-}
|
||||
-
|
||||
/*
|
||||
* If the current CPU has more than one RT task, see if the non
|
||||
* running task can migrate over to a CPU that is running a task
|
62
debian/patches/patchset-pf/fixes/0027-sched-fair-Adhere-to-place_entity-constraints.patch
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
From 14b4658d3fa78b169f36e62e722a076a7c50afd8 Mon Sep 17 00:00:00 2001
|
||||
From: Peter Zijlstra <peterz@infradead.org>
|
||||
Date: Tue, 28 Jan 2025 15:39:49 +0100
|
||||
Subject: sched/fair: Adhere to place_entity() constraints
|
||||
|
||||
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
|
||||
placement bug causing scheduling lag") relies on commit 4423af84b297
|
||||
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
|
||||
trip a WARN in place_entity().
|
||||
|
||||
What happens is that the lag of the very last entity is 0 per
|
||||
definition -- the average of one element matches the value of that
|
||||
element. Therefore place_entity() will match the condition skipping
|
||||
the lag adjustment:
|
||||
|
||||
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
|
||||
|
||||
Without the 'se->vlag' condition -- it will attempt to adjust the zero
|
||||
lag even though we're inserting into an empty tree.
|
||||
|
||||
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
|
||||
don't because they didn't get updated.
|
||||
|
||||
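
Restated as a formula (the same point as above, not new analysis): se->vlag
holds lag_i = V - v_i, with V the load-weighted average vruntime of the queue;
for a queue with exactly one entity V = v_i, so lag_i = 0 by construction,
which is why the very last entity always takes the zero-lag shortcut.
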
Additionally, move update_load_add() after placement() as is
|
||||
consistent with other place_entity() users -- this change is
|
||||
non-functional, place_entity() does not use cfs_rq->load.
|
||||
|
||||
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Reported-by: Mike Galbraith <efault@gmx.de>
|
||||
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
|
||||
Signed-off-by: Mike Galbraith <efault@gmx.de>
|
||||
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de
|
||||
---
|
||||
kernel/sched/fair.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -3808,6 +3808,7 @@ static void reweight_entity(struct cfs_r
|
||||
update_entity_lag(cfs_rq, se);
|
||||
se->deadline -= se->vruntime;
|
||||
se->rel_deadline = 1;
|
||||
+ cfs_rq->nr_queued--;
|
||||
if (!curr)
|
||||
__dequeue_entity(cfs_rq, se);
|
||||
update_load_sub(&cfs_rq->load, se->load.weight);
|
||||
@@ -3834,10 +3835,11 @@ static void reweight_entity(struct cfs_r
|
||||
|
||||
enqueue_load_avg(cfs_rq, se);
|
||||
if (se->on_rq) {
|
||||
- update_load_add(&cfs_rq->load, se->load.weight);
|
||||
place_entity(cfs_rq, se, 0);
|
||||
+ update_load_add(&cfs_rq->load, se->load.weight);
|
||||
if (!curr)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
+ cfs_rq->nr_queued++;
|
||||
|
||||
/*
|
||||
* The entity's vruntime has been adjusted, so let's check
|
184
debian/patches/patchset-pf/fixes/0028-alloc_tag-handle-module-codetag-load-errors-as-modul.patch
vendored
Normal file
@@ -0,0 +1,184 @@
|
||||
From 65419a1e04de111460c4f38c47f1db39e71c3357 Mon Sep 17 00:00:00 2001
|
||||
From: Suren Baghdasaryan <surenb@google.com>
|
||||
Date: Wed, 21 May 2025 09:06:02 -0700
|
||||
Subject: alloc_tag: handle module codetag load errors as module load failures
|
||||
|
||||
Failures inside codetag_load_module() are currently ignored. As a result
|
||||
an error there would not cause a module load failure and freeing of the
|
||||
associated resources. Correct this behavior by propagating the error code
|
||||
to the caller and handling possible errors. With this change, a failure to
allocate percpu counters, which happens at this stage, will not be ignored
|
||||
and will cause a module load failure and freeing of resources. With this
|
||||
change we also do not need to disable memory allocation profiling when
|
||||
this error happens, instead we fail to load the module.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com
|
||||
Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically")
|
||||
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
|
||||
Reported-by: Casey Chen <cachen@purestorage.com>
|
||||
Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/
|
||||
Cc: Daniel Gomez <da.gomez@samsung.com>
|
||||
Cc: David Wang <00107082@163.com>
|
||||
Cc: Kent Overstreet <kent.overstreet@linux.dev>
|
||||
Cc: Luis Chamberlain <mcgrof@kernel.org>
|
||||
Cc: Petr Pavlu <petr.pavlu@suse.com>
|
||||
Cc: Sami Tolvanen <samitolvanen@google.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/codetag.h | 8 ++++----
|
||||
kernel/module/main.c | 5 +++--
|
||||
lib/alloc_tag.c | 12 +++++++-----
|
||||
lib/codetag.c | 34 +++++++++++++++++++++++++---------
|
||||
4 files changed, 39 insertions(+), 20 deletions(-)
|
||||
|
||||
--- a/include/linux/codetag.h
|
||||
+++ b/include/linux/codetag.h
|
||||
@@ -36,8 +36,8 @@ union codetag_ref {
|
||||
struct codetag_type_desc {
|
||||
const char *section;
|
||||
size_t tag_size;
|
||||
- void (*module_load)(struct module *mod,
|
||||
- struct codetag *start, struct codetag *end);
|
||||
+ int (*module_load)(struct module *mod,
|
||||
+ struct codetag *start, struct codetag *end);
|
||||
void (*module_unload)(struct module *mod,
|
||||
struct codetag *start, struct codetag *end);
|
||||
#ifdef CONFIG_MODULES
|
||||
@@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struc
|
||||
unsigned long align);
|
||||
void codetag_free_module_sections(struct module *mod);
|
||||
void codetag_module_replaced(struct module *mod, struct module *new_mod);
|
||||
-void codetag_load_module(struct module *mod);
|
||||
+int codetag_load_module(struct module *mod);
|
||||
void codetag_unload_module(struct module *mod);
|
||||
|
||||
#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
|
||||
@@ -103,7 +103,7 @@ codetag_alloc_module_section(struct modu
|
||||
unsigned long align) { return NULL; }
|
||||
static inline void codetag_free_module_sections(struct module *mod) {}
|
||||
static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {}
|
||||
-static inline void codetag_load_module(struct module *mod) {}
|
||||
+static inline int codetag_load_module(struct module *mod) { return 0; }
|
||||
static inline void codetag_unload_module(struct module *mod) {}
|
||||
|
||||
#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */
|
||||
--- a/kernel/module/main.c
|
||||
+++ b/kernel/module/main.c
|
||||
@@ -3399,11 +3399,12 @@ static int load_module(struct load_info
|
||||
goto sysfs_cleanup;
|
||||
}
|
||||
|
||||
+ if (codetag_load_module(mod))
|
||||
+ goto sysfs_cleanup;
|
||||
+
|
||||
/* Get rid of temporary copy. */
|
||||
free_copy(info, flags);
|
||||
|
||||
- codetag_load_module(mod);
|
||||
-
|
||||
/* Done! */
|
||||
trace_module_load(mod);
|
||||
|
||||
--- a/lib/alloc_tag.c
|
||||
+++ b/lib/alloc_tag.c
|
||||
@@ -618,15 +618,16 @@ out:
|
||||
mas_unlock(&mas);
|
||||
}
|
||||
|
||||
-static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
|
||||
+static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
|
||||
{
|
||||
/* Allocate module alloc_tag percpu counters */
|
||||
struct alloc_tag *start_tag;
|
||||
struct alloc_tag *stop_tag;
|
||||
struct alloc_tag *tag;
|
||||
|
||||
+ /* percpu counters for core allocations are already statically allocated */
|
||||
if (!mod)
|
||||
- return;
|
||||
+ return 0;
|
||||
|
||||
start_tag = ct_to_alloc_tag(start);
|
||||
stop_tag = ct_to_alloc_tag(stop);
|
||||
@@ -638,12 +639,13 @@ static void load_module(struct module *m
|
||||
free_percpu(tag->counters);
|
||||
tag->counters = NULL;
|
||||
}
|
||||
- shutdown_mem_profiling(true);
|
||||
- pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
|
||||
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
|
||||
mod->name);
|
||||
- break;
|
||||
+ return -ENOMEM;
|
||||
}
|
||||
}
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
static void replace_module(struct module *mod, struct module *new_mod)
|
||||
--- a/lib/codetag.c
|
||||
+++ b/lib/codetag.c
|
||||
@@ -167,6 +167,7 @@ static int codetag_module_init(struct co
|
||||
{
|
||||
struct codetag_range range;
|
||||
struct codetag_module *cmod;
|
||||
+ int mod_id;
|
||||
int err;
|
||||
|
||||
range = get_section_range(mod, cttype->desc.section);
|
||||
@@ -190,11 +191,20 @@ static int codetag_module_init(struct co
|
||||
cmod->range = range;
|
||||
|
||||
down_write(&cttype->mod_lock);
|
||||
- err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
|
||||
- if (err >= 0) {
|
||||
- cttype->count += range_size(cttype, &range);
|
||||
- if (cttype->desc.module_load)
|
||||
- cttype->desc.module_load(mod, range.start, range.stop);
|
||||
+ mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
|
||||
+ if (mod_id >= 0) {
|
||||
+ if (cttype->desc.module_load) {
|
||||
+ err = cttype->desc.module_load(mod, range.start, range.stop);
|
||||
+ if (!err)
|
||||
+ cttype->count += range_size(cttype, &range);
|
||||
+ else
|
||||
+ idr_remove(&cttype->mod_idr, mod_id);
|
||||
+ } else {
|
||||
+ cttype->count += range_size(cttype, &range);
|
||||
+ err = 0;
|
||||
+ }
|
||||
+ } else {
|
||||
+ err = mod_id;
|
||||
}
|
||||
up_write(&cttype->mod_lock);
|
||||
|
||||
@@ -295,17 +305,23 @@ void codetag_module_replaced(struct modu
|
||||
mutex_unlock(&codetag_lock);
|
||||
}
|
||||
|
||||
-void codetag_load_module(struct module *mod)
|
||||
+int codetag_load_module(struct module *mod)
|
||||
{
|
||||
struct codetag_type *cttype;
|
||||
+ int ret = 0;
|
||||
|
||||
if (!mod)
|
||||
- return;
|
||||
+ return 0;
|
||||
|
||||
mutex_lock(&codetag_lock);
|
||||
- list_for_each_entry(cttype, &codetag_types, link)
|
||||
- codetag_module_init(cttype, mod);
|
||||
+ list_for_each_entry(cttype, &codetag_types, link) {
|
||||
+ ret = codetag_module_init(cttype, mod);
|
||||
+ if (ret)
|
||||
+ break;
|
||||
+ }
|
||||
mutex_unlock(&codetag_lock);
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
void codetag_unload_module(struct module *mod)
|
29
debian/patches/patchset-pf/fixes/0029-svcrdma-Unregister-the-device-if-svc_rdma_accept-fai.patch
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
From 3848ddd6068c425b732da6e8c78b047ed28c6114 Mon Sep 17 00:00:00 2001
|
||||
From: Chuck Lever <chuck.lever@oracle.com>
|
||||
Date: Sun, 27 Apr 2025 12:39:59 -0400
|
||||
Subject: svcrdma: Unregister the device if svc_rdma_accept() fails
|
||||
|
||||
To handle device removal, svc_rdma_accept() requests removal
|
||||
notification for the underlying device when accepting a connection.
|
||||
However, svc_rdma_free() is not invoked if svc_rdma_accept() fails.
|
||||
There needs to be a matching "unregister" in that case; otherwise
|
||||
the device cannot be removed.
|
||||
|
||||
Fixes: c4de97f7c454 ("svcrdma: Handle device removal outside of the CM event handler")
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
|
||||
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
|
||||
@@ -575,6 +575,7 @@ static struct svc_xprt *svc_rdma_accept(
|
||||
if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
|
||||
ib_destroy_qp(newxprt->sc_qp);
|
||||
rdma_destroy_id(newxprt->sc_cm_id);
|
||||
+ rpcrdma_rn_unregister(dev, &newxprt->sc_rn);
|
||||
/* This call to put will destroy the transport */
|
||||
svc_xprt_put(&newxprt->sc_xprt);
|
||||
return NULL;
|
53
debian/patches/patchset-pf/fixes/0030-SUNRPC-Prevent-hang-on-NFS-mount-with-xprtsec-m-tls.patch
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
From 38b409dd5c2fd9496fde05db4fb538a7e3593922 Mon Sep 17 00:00:00 2001
|
||||
From: Chuck Lever <chuck.lever@oracle.com>
|
||||
Date: Wed, 21 May 2025 16:34:13 -0400
|
||||
Subject: SUNRPC: Prevent hang on NFS mount with xprtsec=[m]tls
|
||||
|
||||
Engineers at Hammerspace noticed that sometimes mounting with
|
||||
"xprtsec=tls" hangs for a minute or so, and then times out, even
|
||||
when the NFS server is reachable and responsive.
|
||||
|
||||
kTLS shuts off data_ready callbacks while strp->msg_ready is set, to
avoid spurious wake-ups when a full TLS record is not yet ready to be
read from the socket.
|
||||
|
||||
Normally msg_ready is clear when the first TLS record arrives on
|
||||
a socket. However, I observed that sometimes tls_setsockopt() sets
|
||||
strp->msg_ready, and that prevents forward progress because
|
||||
tls_data_ready() becomes a no-op.
|
||||
|
||||
Moreover, Jakub says: "If there's a full record queued at the time
|
||||
when [tlshd] passes the socket back to the kernel, it's up to the
|
||||
reader to read the already queued data out." So SunRPC cannot
|
||||
expect a data_ready call when ingress data is already waiting.
|
||||
|
||||
Add an explicit poll after SunRPC's upper transport is set up to
|
||||
pick up any data that arrived after the TLS handshake but before
|
||||
transport set-up is complete.
|
||||
|
||||
Reported-by: Steve Sears <sjs@hammerspace.com>
|
||||
Suggested-by: Jakub Kicinski <kuba@kernel.org>
|
||||
Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class")
|
||||
Tested-by: Mike Snitzer <snitzer@kernel.org>
|
||||
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
|
||||
---
|
||||
net/sunrpc/xprtsock.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/net/sunrpc/xprtsock.c
|
||||
+++ b/net/sunrpc/xprtsock.c
|
||||
@@ -2740,6 +2740,11 @@ static void xs_tcp_tls_setup_socket(stru
|
||||
}
|
||||
rpc_shutdown_client(lower_clnt);
|
||||
|
||||
+ /* Check for ingress data that arrived before the socket's
|
||||
+ * ->data_ready callback was set up.
|
||||
+ */
|
||||
+ xs_poll_check_readable(upper_transport);
|
||||
+
|
||||
out_unlock:
|
||||
current_restore_flags(pflags, PF_MEMALLOC);
|
||||
upper_transport->clnt = NULL;
|
89
debian/patches/patchset-pf/fixes/0031-hv_netvsc-fix-potential-deadlock-in-netvsc_vf_setxdp.patch
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
From c3e0e5bd29d97f8e5663026e8c2f25e08f1c4544 Mon Sep 17 00:00:00 2001
|
||||
From: Saurabh Sengar <ssengar@linux.microsoft.com>
|
||||
Date: Thu, 29 May 2025 03:18:30 -0700
|
||||
Subject: hv_netvsc: fix potential deadlock in netvsc_vf_setxdp()
|
||||
|
||||
The MANA driver's probe registers the netdevice via the following call chain:
|
||||
|
||||
mana_probe()
|
||||
register_netdev()
|
||||
register_netdevice()
|
||||
|
||||
register_netdevice() calls notifier callback for netvsc driver,
|
||||
holding the netdev mutex via netdev_lock_ops().
|
||||
|
||||
Further, this netvsc notifier callback ends up attempting to acquire the
same lock again in dev_xdp_propagate(), leading to a deadlock.
|
||||
|
||||
netvsc_netdev_event()
|
||||
netvsc_vf_setxdp()
|
||||
dev_xdp_propagate()
|
||||
|
||||
This deadlock was not observed so far because net_shaper_ops was never set,
|
||||
and thus the lock was effectively a no-op in this case. Fix this by using
|
||||
netif_xdp_propagate() instead of dev_xdp_propagate() to avoid recursive
|
||||
locking in this path.
|
||||
|
||||
And, since no deadlock is observed on the other path which is via
|
||||
netvsc_probe, add the lock exclusively for that path.
|
||||
|
||||
Also, clean up the unregistration path by removing the unnecessary call to
|
||||
netvsc_vf_setxdp(), since unregister_netdevice_many_notify() already
|
||||
performs this cleanup via dev_xdp_uninstall().
|
||||
|
||||
Fixes: 97246d6d21c2 ("net: hold netdev instance lock during ndo_bpf")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
|
||||
Tested-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
|
||||
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
|
||||
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
|
||||
Link: https://patch.msgid.link/1748513910-23963-1-git-send-email-ssengar@linux.microsoft.com
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
drivers/net/hyperv/netvsc_bpf.c | 2 +-
|
||||
drivers/net/hyperv/netvsc_drv.c | 4 ++--
|
||||
net/core/dev.c | 1 +
|
||||
3 files changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/drivers/net/hyperv/netvsc_bpf.c
|
||||
+++ b/drivers/net/hyperv/netvsc_bpf.c
|
||||
@@ -183,7 +183,7 @@ int netvsc_vf_setxdp(struct net_device *
|
||||
xdp.command = XDP_SETUP_PROG;
|
||||
xdp.prog = prog;
|
||||
|
||||
- ret = dev_xdp_propagate(vf_netdev, &xdp);
|
||||
+ ret = netif_xdp_propagate(vf_netdev, &xdp);
|
||||
|
||||
if (ret && prog)
|
||||
bpf_prog_put(prog);
|
||||
--- a/drivers/net/hyperv/netvsc_drv.c
|
||||
+++ b/drivers/net/hyperv/netvsc_drv.c
|
||||
@@ -2462,8 +2462,6 @@ static int netvsc_unregister_vf(struct n
|
||||
|
||||
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
|
||||
|
||||
- netvsc_vf_setxdp(vf_netdev, NULL);
|
||||
-
|
||||
reinit_completion(&net_device_ctx->vf_add);
|
||||
netdev_rx_handler_unregister(vf_netdev);
|
||||
netdev_upper_dev_unlink(vf_netdev, ndev);
|
||||
@@ -2631,7 +2629,9 @@ static int netvsc_probe(struct hv_device
|
||||
continue;
|
||||
|
||||
netvsc_prepare_bonding(vf_netdev);
|
||||
+ netdev_lock_ops(vf_netdev);
|
||||
netvsc_register_vf(vf_netdev, VF_REG_IN_PROBE);
|
||||
+ netdev_unlock_ops(vf_netdev);
|
||||
__netvsc_vf_setup(net, vf_netdev);
|
||||
break;
|
||||
}
|
||||
--- a/net/core/dev.c
|
||||
+++ b/net/core/dev.c
|
||||
@@ -9863,6 +9863,7 @@ int netif_xdp_propagate(struct net_devic
|
||||
|
||||
return dev->netdev_ops->ndo_bpf(dev, bpf);
|
||||
}
|
||||
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
|
||||
|
||||
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
|
||||
{
|
113
debian/patches/patchset-pf/fixes/0032-net-clear-the-dst-when-changing-skb-protocol.patch
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
From 0f48fca427618cecf6683fa8e46cb8d0b66bb93d Mon Sep 17 00:00:00 2001
|
||||
From: Jakub Kicinski <kuba@kernel.org>
|
||||
Date: Mon, 9 Jun 2025 17:12:44 -0700
|
||||
Subject: net: clear the dst when changing skb protocol
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
A not-so-careful NAT46 BPF program can crash the kernel
|
||||
if it indiscriminately flips ingress packets from v4 to v6:
|
||||
|
||||
BUG: kernel NULL pointer dereference, address: 0000000000000000
|
||||
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
|
||||
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
|
||||
process_backlog (net/core/dev.c:6186:4)
|
||||
napi_poll (net/core/dev.c:6906:9)
|
||||
net_rx_action (net/core/dev.c:7028:13)
|
||||
do_softirq (kernel/softirq.c:462:3)
|
||||
netif_rx (net/core/dev.c:5326:3)
|
||||
dev_loopback_xmit (net/core/dev.c:4015:2)
|
||||
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
|
||||
NF_HOOK (./include/linux/netfilter.h:314:9)
|
||||
ip_mc_output (net/ipv4/ip_output.c:400:5)
|
||||
dst_output (./include/net/dst.h:459:9)
|
||||
ip_local_out (net/ipv4/ip_output.c:130:9)
|
||||
ip_send_skb (net/ipv4/ip_output.c:1496:8)
|
||||
udp_send_skb (net/ipv4/udp.c:1040:8)
|
||||
udp_sendmsg (net/ipv4/udp.c:1328:10)
|
||||
|
||||
The output interface has a 4->6 program attached at ingress.
|
||||
We try to loop the multicast skb back to the sending socket.
|
||||
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
|
||||
and changes skb->protocol to v6. We enter ip6_rcv_core which
|
||||
tries to use skb_dst(). But the dst is still an IPv4 one left
|
||||
after IPv4 mcast output.
|
||||
|
||||
Clear the dst in all BPF helpers which change the protocol.
|
||||
Try to preserve metadata dsts, those may carry non-routing
|
||||
metadata.
|
||||
|
||||
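
For illustration, a hypothetical tc ingress program of the kind described,
which flips the protocol with the bpf_skb_change_proto() helper (before this
fix, the IPv4 dst attached on the multicast output path survived the flip):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <linux/if_ether.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("tc")
    int nat46(struct __sk_buff *skb)
    {
            /* naive 4->6 flip; a real program would also rewrite the headers */
            if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
                    return TC_ACT_SHOT;
            return TC_ACT_OK;
    }

    char LICENSE[] SEC("license") = "GPL";
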
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Maciej Żenczykowski <maze@google.com>
|
||||
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
|
||||
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
|
||||
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
|
||||
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
|
||||
Reviewed-by: Willem de Bruijn <willemb@google.com>
|
||||
Link: https://patch.msgid.link/20250610001245.1981782-1-kuba@kernel.org
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
net/core/filter.c | 19 +++++++++++++------
|
||||
1 file changed, 13 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/net/core/filter.c
|
||||
+++ b/net/core/filter.c
|
||||
@@ -3232,6 +3232,13 @@ static const struct bpf_func_proto bpf_s
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
};
|
||||
|
||||
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
|
||||
+{
|
||||
+ skb->protocol = htons(proto);
|
||||
+ if (skb_valid_dst(skb))
|
||||
+ skb_dst_drop(skb);
|
||||
+}
|
||||
+
|
||||
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
|
||||
{
|
||||
/* Caller already did skb_cow() with len as headroom,
|
||||
@@ -3328,7 +3335,7 @@ static int bpf_skb_proto_4_to_6(struct s
|
||||
}
|
||||
}
|
||||
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
skb_clear_hash(skb);
|
||||
|
||||
return 0;
|
||||
@@ -3358,7 +3365,7 @@ static int bpf_skb_proto_6_to_4(struct s
|
||||
}
|
||||
}
|
||||
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
skb_clear_hash(skb);
|
||||
|
||||
return 0;
|
||||
@@ -3549,10 +3556,10 @@ static int bpf_skb_net_grow(struct sk_bu
|
||||
/* Match skb->protocol to new outer l3 protocol */
|
||||
if (skb->protocol == htons(ETH_P_IP) &&
|
||||
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
else if (skb->protocol == htons(ETH_P_IPV6) &&
|
||||
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
}
|
||||
|
||||
if (skb_is_gso(skb)) {
|
||||
@@ -3605,10 +3612,10 @@ static int bpf_skb_net_shrink(struct sk_
|
||||
/* Match skb->protocol to new outer l3 protocol */
|
||||
if (skb->protocol == htons(ETH_P_IP) &&
|
||||
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
|
||||
- skb->protocol = htons(ETH_P_IPV6);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
|
||||
else if (skb->protocol == htons(ETH_P_IPV6) &&
|
||||
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
|
||||
- skb->protocol = htons(ETH_P_IP);
|
||||
+ bpf_skb_change_protocol(skb, ETH_P_IP);
|
||||
|
||||
if (skb_is_gso(skb)) {
|
||||
struct skb_shared_info *shinfo = skb_shinfo(skb);
|
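For illustration, a minimal tc-ingress BPF sketch of the protocol flip described in the commit message above. The program and its name are hypothetical; only bpf_skb_change_proto() is a real helper, and it is backed by the bpf_skb_proto_4_to_6()/bpf_skb_proto_6_to_4() paths changed in this patch, which now drop the stale dst:

/* Hypothetical NAT46-style flip at tc ingress. Switching skb->protocol to
 * IPv6 via bpf_skb_change_proto() is the operation that, before this fix,
 * could leave an IPv4 dst attached to the now-IPv6 skb. A real program
 * would also build and push the IPv6 header (omitted here). */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int nat46_flip(struct __sk_buff *skb)
{
	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";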
67
debian/patches/patchset-pf/fixes/0033-net_sched-sch_sfq-reject-invalid-perturb-period.patch
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
From 59765af017c206b162b2ceb8d56a171e40a17719 Mon Sep 17 00:00:00 2001
|
||||
From: Eric Dumazet <edumazet@google.com>
|
||||
Date: Wed, 11 Jun 2025 08:35:01 +0000
|
||||
Subject: net_sched: sch_sfq: reject invalid perturb period
|
||||
|
||||
Gerrard Tai reported that SFQ perturb_period has no range check yet,
|
||||
and this can be used to trigger a race condition fixed in a separate patch.
|
||||
|
||||
We want to make sure ctl->perturb_period * HZ will not overflow
|
||||
and is positive.
|
||||
|
||||
Tested:
|
||||
|
||||
tc qd add dev lo root sfq perturb -10 # negative value : error
|
||||
Error: sch_sfq: invalid perturb period.
|
||||
|
||||
tc qd add dev lo root sfq perturb 1000000000 # too big : error
|
||||
Error: sch_sfq: invalid perturb period.
|
||||
|
||||
tc qd add dev lo root sfq perturb 2000000 # acceptable value
|
||||
tc -s -d qd sh dev lo
|
||||
qdisc sfq 8005: root refcnt 2 limit 127p quantum 64Kb depth 127 flows 128 divisor 1024 perturb 2000000sec
|
||||
Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
|
||||
backlog 0b 0p requeues 0
|
||||
|
||||
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
|
||||
Reported-by: Gerrard Tai <gerrard.tai@starlabs.sg>
|
||||
Signed-off-by: Eric Dumazet <edumazet@google.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://patch.msgid.link/20250611083501.1810459-1-edumazet@google.com
|
||||
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
||||
---
|
||||
net/sched/sch_sfq.c | 10 ++++++++--
|
||||
1 file changed, 8 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/net/sched/sch_sfq.c
|
||||
+++ b/net/sched/sch_sfq.c
|
||||
@@ -653,6 +653,14 @@ static int sfq_change(struct Qdisc *sch,
|
||||
NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
|
||||
return -EINVAL;
|
||||
}
|
||||
+
|
||||
+ if (ctl->perturb_period < 0 ||
|
||||
+ ctl->perturb_period > INT_MAX / HZ) {
|
||||
+ NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+ perturb_period = ctl->perturb_period * HZ;
|
||||
+
|
||||
if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
|
||||
ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
|
||||
return -EINVAL;
|
||||
@@ -669,14 +677,12 @@ static int sfq_change(struct Qdisc *sch,
|
||||
headdrop = q->headdrop;
|
||||
maxdepth = q->maxdepth;
|
||||
maxflows = q->maxflows;
|
||||
- perturb_period = q->perturb_period;
|
||||
quantum = q->quantum;
|
||||
flags = q->flags;
|
||||
|
||||
/* update and validate configuration */
|
||||
if (ctl->quantum)
|
||||
quantum = ctl->quantum;
|
||||
- perturb_period = ctl->perturb_period * HZ;
|
||||
if (ctl->flows)
|
||||
maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
|
||||
if (ctl->divisor) {
|
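To put numbers on the new bound, a small user-space sketch (assuming HZ=1000 for illustration; the real value depends on CONFIG_HZ): perturb_period is a signed int of seconds that gets multiplied by HZ, so negative values and anything above INT_MAX / HZ must be rejected, which matches the values exercised in the Tested section above.

#include <limits.h>
#include <stdio.h>

#define HZ 1000	/* illustrative assumption; CONFIG_HZ varies per kernel */

int main(void)
{
	int limit = INT_MAX / HZ;	/* ~2147483 seconds */

	printf("max accepted perturb: %d s\n", limit);
	printf("   2000000 s ok?     %d\n", 2000000 <= limit);		/* 1: accepted */
	printf("1000000000 s ok?     %d\n", 1000000000 <= limit);	/* 0: rejected */
	return 0;
}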
51
debian/patches/patchset-pf/fixes/0034-posix-cpu-timers-fix-race-between-handle_posix_cpu_t.patch
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
From b504e1cd491c55390370059280d5fbaa045d5543 Mon Sep 17 00:00:00 2001
|
||||
From: Oleg Nesterov <oleg@redhat.com>
|
||||
Date: Fri, 13 Jun 2025 19:26:50 +0200
|
||||
Subject: posix-cpu-timers: fix race between handle_posix_cpu_timers() and
|
||||
posix_cpu_timer_del()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
If an exiting non-autoreaping task has already passed exit_notify() and
|
||||
calls handle_posix_cpu_timers() from IRQ, it can be reaped by its parent
|
||||
or debugger right after unlock_task_sighand().
|
||||
|
||||
If a concurrent posix_cpu_timer_del() runs at that moment, it won't be
|
||||
able to detect timer->it.cpu.firing != 0: cpu_timer_task_rcu() and/or
|
||||
lock_task_sighand() will fail.
|
||||
|
||||
Add the tsk->exit_state check into run_posix_cpu_timers() to fix this.
|
||||
|
||||
This fix is not needed if CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y, because
|
||||
exit_task_work() is called before exit_notify(). But the check still
|
||||
makes sense, task_work_add(&tsk->posix_cputimers_work.work) will fail
|
||||
anyway in this case.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Reported-by: Benoît Sevens <bsevens@google.com>
|
||||
Fixes: 0bdd2ed4138e ("sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()")
|
||||
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
---
|
||||
kernel/time/posix-cpu-timers.c | 9 +++++++++
|
||||
1 file changed, 9 insertions(+)
|
||||
|
||||
--- a/kernel/time/posix-cpu-timers.c
|
||||
+++ b/kernel/time/posix-cpu-timers.c
|
||||
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
|
||||
lockdep_assert_irqs_disabled();
|
||||
|
||||
/*
|
||||
+ * Ensure that release_task(tsk) can't happen while
|
||||
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
|
||||
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
|
||||
+ * miss timer->it.cpu.firing != 0.
|
||||
+ */
|
||||
+ if (tsk->exit_state)
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
* If the actual expiry is deferred to task work context and the
|
||||
* work is already scheduled there is no point to do anything here.
|
||||
*/
|
93
debian/patches/patchset-pf/fixes/0035-mm-vma-reset-VMA-iterator-on-commit_merge-OOM-failur.patch
vendored
Normal file
@@ -0,0 +1,93 @@
|
||||
From d7b5f2aa34c56bd2a2d3cda2a7eb7aeb24df6179 Mon Sep 17 00:00:00 2001
|
||||
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Date: Fri, 6 Jun 2025 13:50:32 +0100
|
||||
Subject: mm/vma: reset VMA iterator on commit_merge() OOM failure
|
||||
|
||||
While an OOM failure in commit_merge() isn't really feasible due to the
|
||||
allocation which might fail (a maple tree pre-allocation) being 'too small
|
||||
to fail', we do need to handle this case correctly regardless.
|
||||
|
||||
In vma_merge_existing_range(), we can theoretically encounter failures
|
||||
which result in an OOM error in two ways - firstly dup_anon_vma() might
|
||||
fail with an OOM error, and secondly commit_merge() failing, ultimately,
|
||||
to pre-allocate a maple tree node.
|
||||
|
||||
The abort logic for dup_anon_vma() resets the VMA iterator to the initial
|
||||
range, ensuring that any logic looping on this iterator will correctly
|
||||
proceed to the next VMA.
|
||||
|
||||
However the commit_merge() abort logic does not do the same thing. This
|
||||
resulted in a syzbot report occurring because mlockall() iterates through
|
||||
VMAs, is tolerant of errors, but ended up with an incorrect previous VMA
|
||||
being specified due to incorrect iterator state.
|
||||
|
||||
While making this change, it became apparent we are duplicating logic -
|
||||
the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom
|
||||
option on modify/merge, use in uffd release") duplicates the
|
||||
vmg->give_up_on_oom check in both abort branches.
|
||||
|
||||
Additionally, we observe that we can perform the anon_dup check safely on
|
||||
dup_anon_vma() failure, as this will not be modified should this call
|
||||
fail.
|
||||
|
||||
Finally, we need to reset the iterator in both cases, so now we can simply
|
||||
use the exact same code to abort for both.
|
||||
|
||||
We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to
|
||||
be otherwise and it allows us to implement the abort check more neatly.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com
|
||||
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
|
||||
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com
|
||||
Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/
|
||||
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
|
||||
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
|
||||
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
|
||||
Cc: Jann Horn <jannh@google.com>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vma.c | 22 ++++------------------
|
||||
1 file changed, 4 insertions(+), 18 deletions(-)
|
||||
|
||||
--- a/mm/vma.c
|
||||
+++ b/mm/vma.c
|
||||
@@ -927,26 +927,9 @@ static __must_check struct vm_area_struc
|
||||
err = dup_anon_vma(next, middle, &anon_dup);
|
||||
}
|
||||
|
||||
- if (err)
|
||||
+ if (err || commit_merge(vmg))
|
||||
goto abort;
|
||||
|
||||
- err = commit_merge(vmg);
|
||||
- if (err) {
|
||||
- VM_WARN_ON(err != -ENOMEM);
|
||||
-
|
||||
- if (anon_dup)
|
||||
- unlink_anon_vmas(anon_dup);
|
||||
-
|
||||
- /*
|
||||
- * We've cleaned up any cloned anon_vma's, no VMAs have been
|
||||
- * modified, no harm no foul if the user requests that we not
|
||||
- * report this and just give up, leaving the VMAs unmerged.
|
||||
- */
|
||||
- if (!vmg->give_up_on_oom)
|
||||
- vmg->state = VMA_MERGE_ERROR_NOMEM;
|
||||
- return NULL;
|
||||
- }
|
||||
-
|
||||
khugepaged_enter_vma(vmg->target, vmg->flags);
|
||||
vmg->state = VMA_MERGE_SUCCESS;
|
||||
return vmg->target;
|
||||
@@ -955,6 +938,9 @@ abort:
|
||||
vma_iter_set(vmg->vmi, start);
|
||||
vma_iter_load(vmg->vmi);
|
||||
|
||||
+ if (anon_dup)
|
||||
+ unlink_anon_vmas(anon_dup);
|
||||
+
|
||||
/*
|
||||
* This means we have failed to clone anon_vma's correctly, but no
|
||||
* actual changes to VMAs have occurred, so no harm no foul - if the
|
90
debian/patches/patchset-pf/fixes/0036-mm-close-theoretical-race-where-stale-TLB-entries-co.patch
vendored
Normal file
@@ -0,0 +1,90 @@
|
||||
From db96fe27668a3bb56fa5d745d1c2eed49a95a56f Mon Sep 17 00:00:00 2001
|
||||
From: Ryan Roberts <ryan.roberts@arm.com>
|
||||
Date: Fri, 6 Jun 2025 10:28:07 +0100
|
||||
Subject: mm: close theoretical race where stale TLB entries could linger
|
||||
|
||||
Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a
|
||||
parallel reclaim leaving stale TLB entries") described a theoretical race
|
||||
as such:
|
||||
|
||||
|
||||
"""
|
||||
Nadav Amit identified a theoretical race between page reclaim and mprotect
|
||||
due to TLB flushes being batched outside of the PTL being held.
|
||||
|
||||
He described the race as follows:
|
||||
|
||||
CPU0 CPU1
|
||||
---- ----
|
||||
user accesses memory using RW PTE
|
||||
[PTE now cached in TLB]
|
||||
try_to_unmap_one()
|
||||
==> ptep_get_and_clear()
|
||||
==> set_tlb_ubc_flush_pending()
|
||||
mprotect(addr, PROT_READ)
|
||||
==> change_pte_range()
|
||||
==> [ PTE non-present - no flush ]
|
||||
|
||||
user writes using cached RW PTE
|
||||
...
|
||||
|
||||
try_to_unmap_flush()
|
||||
|
||||
The same type of race exists for reads when protecting for PROT_NONE and
|
||||
also exists for operations that can leave an old TLB entry behind such as
|
||||
munmap, mremap and madvise.
|
||||
"""
|
||||
|
||||
The solution was to introduce flush_tlb_batched_pending() and call it
|
||||
under the PTL from mprotect/madvise/munmap/mremap to complete any pending
|
||||
tlb flushes.
|
||||
|
||||
However, while madvise_free_pte_range() and
|
||||
madvise_cold_or_pageout_pte_range() were both retro-fitted to call
|
||||
flush_tlb_batched_pending() immediately after initially acquiring the PTL,
|
||||
they both temporarily release the PTL to split a large folio if they
|
||||
stumble upon one. In this case, when re-acquiring the PTL,
|
||||
flush_tlb_batched_pending() must be called again, but it previously was
|
||||
not. Let's fix that.
|
||||
|
||||
There are 2 Fixes: tags here: the first is the commit that fixed
|
||||
madvise_free_pte_range(). The second is the commit that added
|
||||
madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the
|
||||
faulty pattern from madvise_free_pte_range().
|
||||
|
||||
This is a theoretical bug discovered during code review.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com
|
||||
Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries")
|
||||
Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD")
|
||||
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
|
||||
Reviewed-by: Jann Horn <jannh@google.com>
|
||||
Acked-by: David Hildenbrand <david@redhat.com>
|
||||
Cc: Liam Howlett <liam.howlett@oracle.com>
|
||||
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/madvise.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
--- a/mm/madvise.c
|
||||
+++ b/mm/madvise.c
|
||||
@@ -503,6 +503,7 @@ restart:
|
||||
pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
if (!start_pte)
|
||||
break;
|
||||
+ flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
if (!err)
|
||||
nr = 0;
|
||||
@@ -736,6 +737,7 @@ static int madvise_free_pte_range(pmd_t
|
||||
start_pte = pte;
|
||||
if (!start_pte)
|
||||
break;
|
||||
+ flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
if (!err)
|
||||
nr = 0;
|
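The shape of the fix, reduced to a sketch (kernel-style fragment, not a complete function; the helper names are the real ones used in mm/madvise.c, the folio-split details are elided):

	/* first acquisition: pending batched flushes handled right away */
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();

	/* ... walk PTEs; a large folio is found ... */

	/* drop the PTL to split the folio, then retake it */
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);
	split_folio(folio);

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);	/* the call this patch adds */
	arch_enter_lazy_mmu_mode();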
33
debian/patches/patchset-pf/fixes/0037-io_uring-kbuf-don-t-truncate-end-buffer-for-multiple.patch
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
From f8c6b0801edd6f50057610c67120ffb42027f2c2 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Fri, 13 Jun 2025 11:01:49 -0600
|
||||
Subject: io_uring/kbuf: don't truncate end buffer for multiple buffer peeks
|
||||
|
||||
If peeking a bunch of buffers, normally io_ring_buffers_peek() will
|
||||
truncate the end buffer. This isn't optimal as presumably more data will
|
||||
be arriving later, and hence it's better to stop with the last full
|
||||
buffer rather than truncate the end buffer.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers")
|
||||
Reported-by: Christian Mazakas <christian.mazakas@gmail.com>
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
io_uring/kbuf.c | 5 ++++-
|
||||
1 file changed, 4 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/io_uring/kbuf.c
|
||||
+++ b/io_uring/kbuf.c
|
||||
@@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct i
|
||||
/* truncate end piece, if needed, for non partial buffers */
|
||||
if (len > arg->max_len) {
|
||||
len = arg->max_len;
|
||||
- if (!(bl->flags & IOBL_INC))
|
||||
+ if (!(bl->flags & IOBL_INC)) {
|
||||
+ if (iov != arg->iovs)
|
||||
+ break;
|
||||
buf->len = len;
|
||||
+ }
|
||||
}
|
||||
|
||||
iov->iov_base = u64_to_user_ptr(buf->addr);
|
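A worked example of the behavioural change, with hypothetical sizes (three 4096-byte provided buffers, max_len of 10000 bytes, non-incremental buffer list):

	before: iov[0]=4096, iov[1]=4096, iov[2]=1808	(third ring buffer truncated)
	after:  iov[0]=4096, iov[1]=4096		(stop at the last full buffer;
							 the third stays intact for
							 data that arrives later)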
54
debian/patches/patchset-pf/fixes/0038-nvme-always-punt-polled-uring_cmd-end_io-work-to-tas.patch
vendored
Normal file
@@ -0,0 +1,54 @@
|
||||
From a2ef8773db38d0c3a41761dbed6fc57afa440161 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Fri, 13 Jun 2025 13:37:41 -0600
|
||||
Subject: nvme: always punt polled uring_cmd end_io work to task_work
|
||||
|
||||
Currently NVMe uring_cmd completions will complete locally, if they are
|
||||
polled. This is done because those completions are always invoked from
|
||||
task context. And while that is true, there's no guarantee that it's
|
||||
invoked under the right ring context, or even task. If someone does
|
||||
NVMe passthrough via multiple threads and with a limited number of
|
||||
poll queues, then ringA may find completions from ringB. For that case,
|
||||
completing the request may not be sound.
|
||||
|
||||
Always just punt the passthrough completions via task_work, which will
|
||||
redirect the completion, if needed.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands")
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
drivers/nvme/host/ioctl.c | 21 +++++++--------------
|
||||
1 file changed, 7 insertions(+), 14 deletions(-)
|
||||
|
||||
--- a/drivers/nvme/host/ioctl.c
|
||||
+++ b/drivers/nvme/host/ioctl.c
|
||||
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd
|
||||
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
/*
|
||||
- * For iopoll, complete it directly. Note that using the uring_cmd
|
||||
- * helper for this is safe only because we check blk_rq_is_poll().
|
||||
- * As that returns false if we're NOT on a polled queue, then it's
|
||||
- * safe to use the polled completion helper.
|
||||
- *
|
||||
- * Otherwise, move the completion to task work.
|
||||
+ * IOPOLL could potentially complete this request directly, but
|
||||
+ * if multiple rings are polling on the same queue, then it's possible
|
||||
+ * for one ring to find completions for another ring. Punting the
|
||||
+ * completion via task_work will always direct it to the right
|
||||
+ * location, rather than potentially complete requests for ringA
|
||||
+ * under iopoll invocations from ringB.
|
||||
*/
|
||||
- if (blk_rq_is_poll(req)) {
|
||||
- if (pdu->bio)
|
||||
- blk_rq_unmap_user(pdu->bio);
|
||||
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
|
||||
- } else {
|
||||
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
- }
|
||||
-
|
||||
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
return RQ_END_IO_FREE;
|
||||
}
|
||||
|
33
debian/patches/patchset-pf/fixes/0039-block-Clear-BIO_EMULATES_ZONE_APPEND-flag-on-BIO-com.patch
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
From bb51adf56b5adc7075252cd17136c2288c116602 Mon Sep 17 00:00:00 2001
|
||||
From: Damien Le Moal <dlemoal@kernel.org>
|
||||
Date: Wed, 11 Jun 2025 09:59:15 +0900
|
||||
Subject: block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
|
||||
|
||||
When blk_zone_write_plug_bio_endio() is called for a regular write BIO
|
||||
used to emulate a zone append operation, that is, a BIO flagged with
|
||||
BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the
|
||||
original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not
|
||||
cleared. Clear it to fully return the BIO to its original definition.
|
||||
|
||||
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
|
||||
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Reviewed-by: Hannes Reinecke <hare@suse.de>
|
||||
Reviewed-by: Christoph Hellwig <hch@lst.de>
|
||||
Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
block/blk-zoned.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/block/blk-zoned.c
|
||||
+++ b/block/blk-zoned.c
|
||||
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struc
|
||||
if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
|
||||
bio->bi_opf &= ~REQ_OP_MASK;
|
||||
bio->bi_opf |= REQ_OP_ZONE_APPEND;
|
||||
+ bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
|
||||
}
|
||||
|
||||
/*
|
65
debian/patches/patchset-pf/fixes/0040-block-use-plug-request-list-tail-for-one-shot-backme.patch
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
From 56ae62470a95ac8249c43f5c0d50da2a83c350e0 Mon Sep 17 00:00:00 2001
|
||||
From: Jens Axboe <axboe@kernel.dk>
|
||||
Date: Wed, 11 Jun 2025 08:48:46 -0600
|
||||
Subject: block: use plug request list tail for one-shot backmerge attempt
|
||||
|
||||
Previously, the block layer stored the requests in the plug list in
|
||||
LIFO order. For this reason, blk_attempt_plug_merge() would check
|
||||
just the head entry for a back merge attempt, and abort after that
|
||||
unless requests for multiple queues existed in the plug list. If more
|
||||
than one request is present in the plug list, this makes the one-shot
|
||||
back merging less useful than before, as it'll always fail to find a
|
||||
quick merge candidate.
|
||||
|
||||
Use the tail entry for the one-shot merge attempt, which is the last
|
||||
added request in the list. If that fails, abort immediately unless
|
||||
there are multiple queues available. If multiple queues are available,
|
||||
then scan the list. Ideally the latter scan would be a backwards scan
|
||||
of the list, but as it currently stands, the plug list is singly linked
|
||||
and hence this isn't easily feasible.
|
||||
|
||||
Cc: stable@vger.kernel.org
|
||||
Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/
|
||||
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
|
||||
Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug")
|
||||
Signed-off-by: Jens Axboe <axboe@kernel.dk>
|
||||
---
|
||||
block/blk-merge.c | 26 +++++++++++++-------------
|
||||
1 file changed, 13 insertions(+), 13 deletions(-)
|
||||
|
||||
--- a/block/blk-merge.c
|
||||
+++ b/block/blk-merge.c
|
||||
@@ -1127,20 +1127,20 @@ bool blk_attempt_plug_merge(struct reque
|
||||
if (!plug || rq_list_empty(&plug->mq_list))
|
||||
return false;
|
||||
|
||||
- rq_list_for_each(&plug->mq_list, rq) {
|
||||
- if (rq->q == q) {
|
||||
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
- BIO_MERGE_OK)
|
||||
- return true;
|
||||
- break;
|
||||
- }
|
||||
+ rq = plug->mq_list.tail;
|
||||
+ if (rq->q == q)
|
||||
+ return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
+ BIO_MERGE_OK;
|
||||
+ else if (!plug->multiple_queues)
|
||||
+ return false;
|
||||
|
||||
- /*
|
||||
- * Only keep iterating plug list for merges if we have multiple
|
||||
- * queues
|
||||
- */
|
||||
- if (!plug->multiple_queues)
|
||||
- break;
|
||||
+ rq_list_for_each(&plug->mq_list, rq) {
|
||||
+ if (rq->q != q)
|
||||
+ continue;
|
||||
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
+ BIO_MERGE_OK)
|
||||
+ return true;
|
||||
+ break;
|
||||
}
|
||||
return false;
|
||||
}
|
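A toy sketch of the lookup order adopted above (plain C, not the kernel structures): with requests appended at the tail, the last-added request is the natural one-shot backmerge candidate, and a full scan is only worth doing when the plug holds requests for more than one queue.

#include <stddef.h>

struct rq { struct rq *next; int queue; };
struct plug { struct rq *head, *tail; int multiple_queues; };

/* Return the request a bio for queue q should try to merge into. */
static struct rq *merge_candidate(struct plug *p, int q)
{
	if (p->tail && p->tail->queue == q)
		return p->tail;		/* one-shot attempt on the last-added rq */
	if (!p->multiple_queues)
		return NULL;		/* single queue: nothing else can match */
	for (struct rq *r = p->head; r; r = r->next)
		if (r->queue == q)
			return r;	/* first rq for the same queue */
	return NULL;
}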
@@ -1,89 +0,0 @@
|
||||
From aadea0887cca5739137f109eab0e1b38604c8af8 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Thu, 13 Feb 2025 11:13:53 -0500
|
||||
Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call
|
||||
|
||||
Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table.
|
||||
|
||||
Get rid of the indirection by simply calling tlb_remove_table directly,
|
||||
and not going through the paravirt function pointers.
|
||||
|
||||
Suggested-by: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Tested-by: Manali Shukla <Manali.Shukla@amd.com>
|
||||
Tested-by: Brendan Jackman <jackmanb@google.com>
|
||||
Tested-by: Michael Kelley <mhklinux@outlook.com>
|
||||
Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com
|
||||
---
|
||||
arch/x86/hyperv/mmu.c | 1 -
|
||||
arch/x86/include/asm/paravirt.h | 5 -----
|
||||
arch/x86/include/asm/paravirt_types.h | 2 --
|
||||
arch/x86/kernel/kvm.c | 1 -
|
||||
arch/x86/kernel/paravirt.c | 1 -
|
||||
arch/x86/xen/mmu_pv.c | 1 -
|
||||
6 files changed, 11 deletions(-)
|
||||
|
||||
--- a/arch/x86/hyperv/mmu.c
|
||||
+++ b/arch/x86/hyperv/mmu.c
|
||||
@@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void)
|
||||
|
||||
pr_info("Using hypercall for remote TLB flush\n");
|
||||
pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
}
|
||||
--- a/arch/x86/include/asm/paravirt.h
|
||||
+++ b/arch/x86/include/asm/paravirt.h
|
||||
@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(con
|
||||
PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
|
||||
}
|
||||
|
||||
-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
|
||||
-{
|
||||
- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
|
||||
-}
|
||||
-
|
||||
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
|
||||
{
|
||||
PVOP_VCALL1(mmu.exit_mmap, mm);
|
||||
--- a/arch/x86/include/asm/paravirt_types.h
|
||||
+++ b/arch/x86/include/asm/paravirt_types.h
|
||||
@@ -133,8 +133,6 @@ struct pv_mmu_ops {
|
||||
void (*flush_tlb_multi)(const struct cpumask *cpus,
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
|
||||
-
|
||||
/* Hook for intercepting the destruction of an mm_struct. */
|
||||
void (*exit_mmap)(struct mm_struct *mm);
|
||||
void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc);
|
||||
--- a/arch/x86/kernel/kvm.c
|
||||
+++ b/arch/x86/kernel/kvm.c
|
||||
@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void)
|
||||
#ifdef CONFIG_SMP
|
||||
if (pv_tlb_flush_supported()) {
|
||||
pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
|
||||
- pv_ops.mmu.tlb_remove_table = tlb_remove_table;
|
||||
pr_info("KVM setup pv remote TLB flush\n");
|
||||
}
|
||||
|
||||
--- a/arch/x86/kernel/paravirt.c
|
||||
+++ b/arch/x86/kernel/paravirt.c
|
||||
@@ -182,7 +182,6 @@ struct paravirt_patch_template pv_ops =
|
||||
.mmu.flush_tlb_kernel = native_flush_tlb_global,
|
||||
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
|
||||
.mmu.flush_tlb_multi = native_flush_tlb_multi,
|
||||
- .mmu.tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.mmu.exit_mmap = paravirt_nop,
|
||||
.mmu.notify_page_enc_status_changed = paravirt_nop,
|
||||
--- a/arch/x86/xen/mmu_pv.c
|
||||
+++ b/arch/x86/xen/mmu_pv.c
|
||||
@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops
|
||||
.flush_tlb_kernel = xen_flush_tlb,
|
||||
.flush_tlb_one_user = xen_flush_tlb_one_user,
|
||||
.flush_tlb_multi = xen_flush_tlb_multi,
|
||||
- .tlb_remove_table = tlb_remove_table,
|
||||
|
||||
.pgd_alloc = xen_pgd_alloc,
|
||||
.pgd_free = xen_pgd_free,
|
@@ -1,87 +0,0 @@
|
||||
From 170f37d1499a28f7a1902e007111867c7cf0147f Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:36 -0500
|
||||
Subject: x86/mm: Consolidate full flush threshold decision
|
||||
|
||||
Reduce code duplication by consolidating the decision point for whether to do
|
||||
individual invalidations or a full flush inside get_flush_tlb_info().
|
||||
|
||||
Suggested-by: Dave Hansen <dave.hansen@intel.com>
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Acked-by: Dave Hansen <dave.hansen@intel.com>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-2-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 41 +++++++++++++++++++----------------------
|
||||
1 file changed, 19 insertions(+), 22 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1019,6 +1019,15 @@ static struct flush_tlb_info *get_flush_
|
||||
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
|
||||
#endif
|
||||
|
||||
+ /*
|
||||
+ * If the number of flushes is so large that a full flush
|
||||
+ * would be faster, do a full flush.
|
||||
+ */
|
||||
+ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
|
||||
+ start = 0;
|
||||
+ end = TLB_FLUSH_ALL;
|
||||
+ }
|
||||
+
|
||||
info->start = start;
|
||||
info->end = end;
|
||||
info->mm = mm;
|
||||
@@ -1045,17 +1054,8 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
bool freed_tables)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
+ int cpu = get_cpu();
|
||||
u64 new_tlb_gen;
|
||||
- int cpu;
|
||||
-
|
||||
- cpu = get_cpu();
|
||||
-
|
||||
- /* Should we flush just the requested range? */
|
||||
- if ((end == TLB_FLUSH_ALL) ||
|
||||
- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
||||
- start = 0;
|
||||
- end = TLB_FLUSH_ALL;
|
||||
- }
|
||||
|
||||
/* This is also a barrier that synchronizes with switch_mm(). */
|
||||
new_tlb_gen = inc_mm_tlb_gen(mm);
|
||||
@@ -1108,22 +1108,19 @@ static void do_kernel_range_flush(void *
|
||||
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
- /* Balance as user space task's flush, a bit conservative */
|
||||
- if (end == TLB_FLUSH_ALL ||
|
||||
- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
- } else {
|
||||
- struct flush_tlb_info *info;
|
||||
+ struct flush_tlb_info *info;
|
||||
|
||||
- preempt_disable();
|
||||
- info = get_flush_tlb_info(NULL, start, end, 0, false,
|
||||
- TLB_GENERATION_INVALID);
|
||||
+ guard(preempt)();
|
||||
|
||||
+ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false,
|
||||
+ TLB_GENERATION_INVALID);
|
||||
+
|
||||
+ if (info->end == TLB_FLUSH_ALL)
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+ else
|
||||
on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
|
||||
- put_flush_tlb_info();
|
||||
- preempt_enable();
|
||||
- }
|
||||
+ put_flush_tlb_info();
|
||||
}
|
||||
|
||||
/*
|
@@ -1,103 +0,0 @@
|
||||
From acb5a284db4fa3dbbb246ab8fa58da0143cd68ce Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:37 -0500
|
||||
Subject: x86/mm: Add INVLPGB feature and Kconfig entry
|
||||
|
||||
In addition, the CPU advertises the maximum number of pages that can be
|
||||
shot down with one INVLPGB instruction in CPUID. Save that information
|
||||
for later use.
|
||||
|
||||
[ bp: use cpu_has(), typos, massage. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-3-riel@surriel.com
|
||||
---
|
||||
arch/x86/Kconfig.cpu | 4 ++++
|
||||
arch/x86/include/asm/cpufeatures.h | 1 +
|
||||
arch/x86/include/asm/disabled-features.h | 8 +++++++-
|
||||
arch/x86/include/asm/tlbflush.h | 3 +++
|
||||
arch/x86/kernel/cpu/amd.c | 6 ++++++
|
||||
5 files changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/Kconfig.cpu
|
||||
+++ b/arch/x86/Kconfig.cpu
|
||||
@@ -740,6 +740,10 @@ menuconfig PROCESSOR_SELECT
|
||||
This lets you choose what x86 vendor support code your kernel
|
||||
will include.
|
||||
|
||||
+config BROADCAST_TLB_FLUSH
|
||||
+ def_bool y
|
||||
+ depends on CPU_SUP_AMD && 64BIT
|
||||
+
|
||||
config CPU_SUP_INTEL
|
||||
default y
|
||||
bool "Support Intel processors" if PROCESSOR_SELECT
|
||||
--- a/arch/x86/include/asm/cpufeatures.h
|
||||
+++ b/arch/x86/include/asm/cpufeatures.h
|
||||
@@ -338,6 +338,7 @@
|
||||
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
|
||||
#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
|
||||
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
|
||||
+#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */
|
||||
#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
|
||||
#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
|
||||
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
|
||||
--- a/arch/x86/include/asm/disabled-features.h
|
||||
+++ b/arch/x86/include/asm/disabled-features.h
|
||||
@@ -129,6 +129,12 @@
|
||||
#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+#define DISABLE_INVLPGB 0
|
||||
+#else
|
||||
+#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31))
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Make sure to add features to the correct mask
|
||||
*/
|
||||
@@ -146,7 +152,7 @@
|
||||
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
|
||||
DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
|
||||
#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM)
|
||||
-#define DISABLED_MASK13 0
|
||||
+#define DISABLED_MASK13 (DISABLE_INVLPGB)
|
||||
#define DISABLED_MASK14 0
|
||||
#define DISABLED_MASK15 0
|
||||
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -183,6 +183,9 @@ static inline void cr4_init_shadow(void)
|
||||
extern unsigned long mmu_cr4_features;
|
||||
extern u32 *trampoline_cr4_features;
|
||||
|
||||
+/* How many pages can be invalidated with one INVLPGB. */
|
||||
+extern u16 invlpgb_count_max;
|
||||
+
|
||||
extern void initialize_tlbstate_and_flush(void);
|
||||
|
||||
/*
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "cpu.h"
|
||||
|
||||
+u16 invlpgb_count_max __ro_after_init;
|
||||
+
|
||||
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
|
||||
{
|
||||
u32 gprs[8] = { 0 };
|
||||
@@ -1145,6 +1147,10 @@ static void cpu_detect_tlb_amd(struct cp
|
||||
tlb_lli_2m[ENTRIES] = eax & mask;
|
||||
|
||||
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
|
||||
+
|
||||
+ /* Max number of pages INVLPGB can invalidate in one shot */
|
||||
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
|
||||
}
|
||||
|
||||
static const struct cpu_dev amd_cpu_dev = {
|
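The CPUID field saved above can also be inspected from user space; a small sketch for illustration (the leaf and bit layout are exactly what the amd.c hunk reads, the program itself is hypothetical):

/* Read CPUID leaf 0x80000008: EDX[15:0] holds the maximum INVLPGB page
 * count minus one, i.e. the same value the kernel stores in
 * invlpgb_count_max. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("invlpgb_count_max = %u\n", (edx & 0xffff) + 1);
	return 0;
}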
@@ -1,170 +0,0 @@
|
||||
From 27bab4a6ed6ee7b7b0e2d216b8802800ef26b2ad Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Fri, 28 Feb 2025 20:32:30 +0100
|
||||
Subject: x86/mm: Add INVLPGB support code
|
||||
|
||||
Add helper functions and definitions needed to use broadcast TLB
|
||||
invalidation on AMD CPUs.
|
||||
|
||||
[ bp:
|
||||
- Cleanup commit message
|
||||
- Improve and expand comments
|
||||
- push the preemption guards inside the invlpgb* helpers
|
||||
- merge improvements from dhansen
|
||||
- add !CONFIG_BROADCAST_TLB_FLUSH function stubs because Clang
|
||||
can't do DCE properly yet and looks at the inline asm and
|
||||
complains about it getting a u64 argument on 32-bit code ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-4-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlb.h | 132 +++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 132 insertions(+)
|
||||
|
||||
--- a/arch/x86/include/asm/tlb.h
|
||||
+++ b/arch/x86/include/asm/tlb.h
|
||||
@@ -6,6 +6,9 @@
|
||||
static inline void tlb_flush(struct mmu_gather *tlb);
|
||||
|
||||
#include <asm-generic/tlb.h>
|
||||
+#include <linux/kernel.h>
|
||||
+#include <vdso/bits.h>
|
||||
+#include <vdso/page.h>
|
||||
|
||||
static inline void tlb_flush(struct mmu_gather *tlb)
|
||||
{
|
||||
@@ -25,4 +28,133 @@ static inline void invlpg(unsigned long
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
}
|
||||
|
||||
+enum addr_stride {
|
||||
+ PTE_STRIDE = 0,
|
||||
+ PMD_STRIDE = 1
|
||||
+};
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+/*
|
||||
+ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
|
||||
+ *
|
||||
+ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
|
||||
+ * be done in a parallel fashion.
|
||||
+ *
|
||||
+ * The instruction takes the number of extra pages to invalidate, beyond
|
||||
+ * the first page, while __invlpgb gets the more human readable number of
|
||||
+ * pages to invalidate.
|
||||
+ *
|
||||
+ * The bits in rax[0:2] determine respectively which components of the address
|
||||
+ * (VA, PCID, ASID) get compared when flushing. If none of these bits are set, *any*
|
||||
+ * address in the specified range matches.
|
||||
+ *
|
||||
+ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
|
||||
+ * this CPU have completed.
|
||||
+ */
|
||||
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
|
||||
+ unsigned long addr, u16 nr_pages,
|
||||
+ enum addr_stride stride, u8 flags)
|
||||
+{
|
||||
+ u32 edx = (pcid << 16) | asid;
|
||||
+ u32 ecx = (stride << 31) | (nr_pages - 1);
|
||||
+ u64 rax = addr | flags;
|
||||
+
|
||||
+ /* The low bits in rax are for flags. Verify addr is clean. */
|
||||
+ VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
|
||||
+
|
||||
+ /* INVLPGB; supported in binutils >= 2.36. */
|
||||
+ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx));
|
||||
+}
|
||||
+
|
||||
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags)
|
||||
+{
|
||||
+ __invlpgb(asid, pcid, 0, 1, 0, flags);
|
||||
+}
|
||||
+
|
||||
+static inline void __tlbsync(void)
|
||||
+{
|
||||
+ /*
|
||||
+ * TLBSYNC waits for INVLPGB instructions originating on the same CPU
|
||||
+ * to have completed. Print a warning if the task has been migrated,
|
||||
+ * and might not be waiting on all the INVLPGBs issued during this TLB
|
||||
+ * invalidation sequence.
|
||||
+ */
|
||||
+ cant_migrate();
|
||||
+
|
||||
+ /* TLBSYNC: supported in binutils >= 2.36. */
|
||||
+ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
|
||||
+}
|
||||
+#else
|
||||
+/* Some compilers (I'm looking at you clang!) simply can't do DCE */
|
||||
+static inline void __invlpgb(unsigned long asid, unsigned long pcid,
|
||||
+ unsigned long addr, u16 nr_pages,
|
||||
+ enum addr_stride s, u8 flags) { }
|
||||
+static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { }
|
||||
+static inline void __tlbsync(void) { }
|
||||
+#endif
|
||||
+
|
||||
+/*
|
||||
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
+ * of the three. For example:
|
||||
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
+ *
|
||||
+ * The first is used to invalidate (kernel) mappings at a particular
|
||||
+ * address across all processes.
|
||||
+ *
|
||||
+ * The latter invalidates all TLB entries matching a PCID.
|
||||
+ */
|
||||
+#define INVLPGB_FLAG_VA BIT(0)
|
||||
+#define INVLPGB_FLAG_PCID BIT(1)
|
||||
+#define INVLPGB_FLAG_ASID BIT(2)
|
||||
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
+
|
||||
+/* The implied mode when all bits are clear: */
|
||||
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
+
|
||||
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
|
||||
+ unsigned long addr,
|
||||
+ u16 nr, bool stride)
|
||||
+{
|
||||
+ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE;
|
||||
+ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA;
|
||||
+
|
||||
+ __invlpgb(0, pcid, addr, nr, str, flags);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for a given PCID, not including globals. */
|
||||
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
|
||||
+{
|
||||
+ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_all(void)
|
||||
+{
|
||||
+ /*
|
||||
+ * TLBSYNC at the end needs to make sure all flushes done on the
|
||||
+ * current CPU have been executed system-wide. Therefore, make
|
||||
+ * sure nothing gets migrated in-between but disable preemption
|
||||
+ * as it is cheaper.
|
||||
+ */
|
||||
+ guard(preempt)();
|
||||
+ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL);
|
||||
+ __tlbsync();
|
||||
+}
|
||||
+
|
||||
+/* Flush addr, including globals, for all PCIDs. */
|
||||
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
|
||||
+{
|
||||
+ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL);
|
||||
+}
|
||||
+
|
||||
+/* Flush all mappings for all PCIDs except globals. */
|
||||
+static inline void invlpgb_flush_all_nonglobals(void)
|
||||
+{
|
||||
+ guard(preempt)();
|
||||
+ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS);
|
||||
+ __tlbsync();
|
||||
+}
|
||||
#endif /* _ASM_X86_TLB_H */
|
@@ -1,97 +0,0 @@
|
||||
From 358d71638f420efe8f7e05ce74aefe13e9320283 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:39 -0500
|
||||
Subject: x86/mm: Use INVLPGB for kernel TLB flushes
|
||||
|
||||
Use broadcast TLB invalidation for kernel addresses when available.
|
||||
Remove the need to send IPIs for kernel TLB flushes.
|
||||
|
||||
[ bp: Integrate dhansen's comments additions, merge the
|
||||
flush_tlb_all() change into this one too. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-5-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 44 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1083,7 +1083,6 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
|
||||
}
|
||||
|
||||
-
|
||||
static void do_flush_tlb_all(void *info)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
||||
@@ -1093,7 +1092,32 @@ static void do_flush_tlb_all(void *info)
|
||||
void flush_tlb_all(void)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+
|
||||
+ /* First try (faster) hardware-assisted TLB invalidation. */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_flush_all();
|
||||
+ else
|
||||
+ /* Fall back to the IPI-based invalidation. */
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+}
|
||||
+
|
||||
+/* Flush an arbitrarily large range of memory with INVLPGB. */
|
||||
+static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ unsigned long addr, nr;
|
||||
+
|
||||
+ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
|
||||
+ nr = (info->end - addr) >> PAGE_SHIFT;
|
||||
+
|
||||
+ /*
|
||||
+ * INVLPGB has a limit on the size of ranges it can
|
||||
+ * flush. Break up large flushes.
|
||||
+ */
|
||||
+ nr = clamp_val(nr, 1, invlpgb_count_max);
|
||||
+
|
||||
+ invlpgb_flush_addr_nosync(addr, nr);
|
||||
+ }
|
||||
+ __tlbsync();
|
||||
}
|
||||
|
||||
static void do_kernel_range_flush(void *info)
|
||||
@@ -1106,6 +1130,22 @@ static void do_kernel_range_flush(void *
|
||||
flush_tlb_one_kernel(addr);
|
||||
}
|
||||
|
||||
+static void kernel_tlb_flush_all(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_flush_all();
|
||||
+ else
|
||||
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+}
|
||||
+
|
||||
+static void kernel_tlb_flush_range(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ invlpgb_kernel_range_flush(info);
|
||||
+ else
|
||||
+ on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
+}
|
||||
+
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
@@ -1116,9 +1156,9 @@ void flush_tlb_kernel_range(unsigned lon
|
||||
TLB_GENERATION_INVALID);
|
||||
|
||||
if (info->end == TLB_FLUSH_ALL)
|
||||
- on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
+ kernel_tlb_flush_all(info);
|
||||
else
|
||||
- on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
+ kernel_tlb_flush_range(info);
|
||||
|
||||
put_flush_tlb_info();
|
||||
}
|
@@ -1,32 +0,0 @@
|
||||
From 7cf099de79e12d6c4949f733c8cbb241bb08f07a Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:41 -0500
|
||||
Subject: x86/mm: Use broadcast TLB flushing in page reclaim
|
||||
|
||||
Page reclaim tracks only the CPU(s) where the TLB needs to be flushed, rather
|
||||
than all the individual mappings that may be getting invalidated.
|
||||
|
||||
Use broadcast TLB flushing when that is available.
|
||||
|
||||
[ bp: Massage commit message. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-7-riel@surriel.com
|
||||
---
|
||||
arch/x86/mm/tlb.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1339,7 +1339,9 @@ void arch_tlbbatch_flush(struct arch_tlb
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
flush_tlb_multi(&batch->cpumask, info);
|
||||
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
||||
lockdep_assert_irqs_enabled();
|
@@ -1,286 +0,0 @@
|
||||
From f9ecaaca7ac26789d7d3e0d8022b7c99599dc8a3 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:42 -0500
|
||||
Subject: x86/mm: Add global ASID allocation helper functions
|
||||
|
||||
Add functions to manage global ASID space. Multithreaded processes that are
|
||||
simultaneously active on 4 or more CPUs can get a global ASID, resulting in the
|
||||
same PCID being used for that process on every CPU.
|
||||
|
||||
This in turn will allow the kernel to use hardware-assisted TLB flushing
|
||||
through AMD INVLPGB or Intel RAR for these processes.
|
||||
|
||||
[ bp:
|
||||
- Extend use_global_asid() comment
|
||||
- s/X86_BROADCAST_TLB_FLUSH/BROADCAST_TLB_FLUSH/g
|
||||
- other touchups ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-8-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/mmu.h | 12 +++
|
||||
arch/x86/include/asm/mmu_context.h | 2 +
|
||||
arch/x86/include/asm/tlbflush.h | 37 +++++++
|
||||
arch/x86/mm/tlb.c | 154 ++++++++++++++++++++++++++++-
|
||||
4 files changed, 202 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/mmu.h
|
||||
+++ b/arch/x86/include/asm/mmu.h
|
||||
@@ -69,6 +69,18 @@ typedef struct {
|
||||
u16 pkey_allocation_map;
|
||||
s16 execute_only_pkey;
|
||||
#endif
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+ /*
|
||||
+ * The global ASID will be a non-zero value when the process has
|
||||
+ * the same ASID across all CPUs, allowing it to make use of
|
||||
+ * hardware-assisted remote TLB invalidation like AMD INVLPGB.
|
||||
+ */
|
||||
+ u16 global_asid;
|
||||
+
|
||||
+ /* The process is transitioning to a new global ASID number. */
|
||||
+ bool asid_transition;
|
||||
+#endif
|
||||
} mm_context_t;
|
||||
|
||||
#define INIT_MM_CONTEXT(mm) \
|
||||
--- a/arch/x86/include/asm/mmu_context.h
|
||||
+++ b/arch/x86/include/asm/mmu_context.h
|
||||
@@ -139,6 +139,8 @@ static inline void mm_reset_untag_mask(s
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
+extern void mm_free_global_asid(struct mm_struct *mm);
|
||||
+
|
||||
/*
|
||||
* Init a new mm. Used on mm copies, like at fork()
|
||||
* and on mm's that are brand-new, like at execve().
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
+#include <asm/barrier.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/special_insns.h>
|
||||
@@ -234,6 +235,42 @@ void flush_tlb_one_kernel(unsigned long
|
||||
void flush_tlb_multi(const struct cpumask *cpumask,
|
||||
const struct flush_tlb_info *info);
|
||||
|
||||
+static inline bool is_dyn_asid(u16 asid)
|
||||
+{
|
||||
+ return asid < TLB_NR_DYN_ASIDS;
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+static inline u16 mm_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return 0;
|
||||
+
|
||||
+ asid = smp_load_acquire(&mm->context.global_asid);
|
||||
+
|
||||
+ /* mm->context.global_asid is either 0, or a global ASID */
|
||||
+ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));
|
||||
+
|
||||
+ return asid;
|
||||
+}
|
||||
+
|
||||
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
|
||||
+{
|
||||
+ /*
|
||||
+ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
|
||||
+ * finish_asid_transition() needs to observe asid_transition = true
|
||||
+ * once it observes global_asid.
|
||||
+ */
|
||||
+ mm->context.asid_transition = true;
|
||||
+ smp_store_release(&mm->context.global_asid, asid);
|
||||
+}
|
||||
+#else
|
||||
+static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
+static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
+
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
#include <asm/paravirt.h>
|
||||
#endif
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -74,13 +74,15 @@
|
||||
* use different names for each of them:
|
||||
*
|
||||
* ASID - [0, TLB_NR_DYN_ASIDS-1]
|
||||
- * the canonical identifier for an mm
|
||||
+ * the canonical identifier for an mm, dynamically allocated on each CPU
|
||||
+ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
|
||||
+ * the canonical, global identifier for an mm, identical across all CPUs
|
||||
*
|
||||
- * kPCID - [1, TLB_NR_DYN_ASIDS]
|
||||
+ * kPCID - [1, MAX_ASID_AVAILABLE]
|
||||
* the value we write into the PCID part of CR3; corresponds to the
|
||||
* ASID+1, because PCID 0 is special.
|
||||
*
|
||||
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
|
||||
+ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
|
||||
* for KPTI each mm has two address spaces and thus needs two
|
||||
* PCID values, but we can still do with a single ASID denomination
|
||||
* for each mm. Corresponds to kPCID + 2048.
|
||||
@@ -252,6 +254,152 @@ static void choose_new_asid(struct mm_st
|
||||
}
|
||||
|
||||
/*
|
||||
+ * Global ASIDs are allocated for multi-threaded processes that are
|
||||
+ * active on multiple CPUs simultaneously, giving each of those
|
||||
+ * processes the same PCID on every CPU, for use with hardware-assisted
|
||||
+ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
|
||||
+ *
|
||||
+ * These global ASIDs are held for the lifetime of the process.
|
||||
+ */
|
||||
+static DEFINE_RAW_SPINLOCK(global_asid_lock);
|
||||
+static u16 last_global_asid = MAX_ASID_AVAILABLE;
|
||||
+static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
|
||||
+static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
|
||||
+static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
|
||||
+
|
||||
+/*
|
||||
+ * When the search for a free ASID in the global ASID space reaches
|
||||
+ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
|
||||
+ * freed global ASIDs are safe to re-use.
|
||||
+ *
|
||||
+ * This way the global flush only needs to happen at ASID rollover
|
||||
+ * time, and not at ASID allocation time.
|
||||
+ */
|
||||
+static void reset_global_asid_space(void)
|
||||
+{
|
||||
+ lockdep_assert_held(&global_asid_lock);
|
||||
+
|
||||
+ invlpgb_flush_all_nonglobals();
|
||||
+
|
||||
+ /*
|
||||
+ * The TLB flush above makes it safe to re-use the previously
|
||||
+ * freed global ASIDs.
|
||||
+ */
|
||||
+ bitmap_andnot(global_asid_used, global_asid_used,
|
||||
+ global_asid_freed, MAX_ASID_AVAILABLE);
|
||||
+ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE);
|
||||
+
|
||||
+ /* Restart the search from the start of global ASID space. */
|
||||
+ last_global_asid = TLB_NR_DYN_ASIDS;
|
||||
+}
|
||||
+
|
||||
+static u16 allocate_global_asid(void)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ lockdep_assert_held(&global_asid_lock);
|
||||
+
|
||||
+ /* The previous allocation hit the edge of available address space */
|
||||
+ if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
|
||||
+ reset_global_asid_space();
|
||||
+
|
||||
+ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid);
|
||||
+
|
||||
+ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
|
||||
+ /* This should never happen. */
|
||||
+ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n",
|
||||
+ global_asid_available);
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ /* Claim this global ASID. */
|
||||
+ __set_bit(asid, global_asid_used);
|
||||
+ last_global_asid = asid;
|
||||
+ global_asid_available--;
|
||||
+ return asid;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Check whether a process is currently active on more than @threshold CPUs.
|
||||
+ * This is a cheap estimation on whether or not it may make sense to assign
|
||||
+ * a global ASID to this process, and use broadcast TLB invalidation.
|
||||
+ */
|
||||
+static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold)
|
||||
+{
|
||||
+ int count = 0;
|
||||
+ int cpu;
|
||||
+
|
||||
+ /* This quick check should eliminate most single threaded programs. */
|
||||
+ if (cpumask_weight(mm_cpumask(mm)) <= threshold)
|
||||
+ return false;
|
||||
+
|
||||
+ /* Slower check to make sure. */
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ /* Skip the CPUs that aren't really running this process. */
|
||||
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
|
||||
+ continue;
|
||||
+
|
||||
+ if (++count > threshold)
|
||||
+ return true;
|
||||
+ }
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Assign a global ASID to the current process, protecting against
|
||||
+ * races between multiple threads in the process.
|
||||
+ */
|
||||
+static void use_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ u16 asid;
|
||||
+
|
||||
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
|
||||
+
|
||||
+ /* This process is already using broadcast TLB invalidation. */
|
||||
+ if (mm_global_asid(mm))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * The last global ASID was consumed while waiting for the lock.
|
||||
+ *
|
||||
+ * If this fires, a more aggressive ASID reuse scheme might be
|
||||
+ * needed.
|
||||
+ */
|
||||
+ if (!global_asid_available) {
|
||||
+ VM_WARN_ONCE(1, "Ran out of global ASIDs\n");
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ asid = allocate_global_asid();
|
||||
+ if (!asid)
|
||||
+ return;
|
||||
+
|
||||
+ mm_assign_global_asid(mm, asid);
|
||||
+}
|
||||
+
|
||||
+void mm_free_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return;
|
||||
+
|
||||
+ if (!mm_global_asid(mm))
|
||||
+ return;
|
||||
+
|
||||
+ guard(raw_spinlock_irqsave)(&global_asid_lock);
|
||||
+
|
||||
+ /* The global ASID can be re-used only after flush at wrap-around. */
|
||||
+#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
+ __set_bit(mm->context.global_asid, global_asid_freed);
|
||||
+
|
||||
+ mm->context.global_asid = 0;
|
||||
+ global_asid_available++;
|
||||
+#endif
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
@@ -1,219 +0,0 @@
|
||||
From b56070b9f121507cabe352e03f0c534db2d5adc7 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:43 -0500
|
||||
Subject: x86/mm: Handle global ASID context switch and TLB flush
|
||||
|
||||
Add context switch and TLB flush support for processes that use a global
|
||||
ASID and PCID across all CPUs.
|
||||
|
||||
At both context switch time and TLB flush time, check whether the task is
|
||||
switching to a global ASID and, if so, reload the TLB with the new ASID
|
||||
as appropriate.
|
||||
|
||||
In both code paths, the TLB flush is avoided if a global ASID is used, because
|
||||
the global ASIDs are always kept up to date across CPUs, even when the
|
||||
process is not running on a CPU.
|
||||
|
||||
[ bp:
|
||||
- Massage
|
||||
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
|
||||
]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-9-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlbflush.h | 14 ++++++
|
||||
arch/x86/mm/tlb.c | 77 ++++++++++++++++++++++++++++++---
|
||||
2 files changed, 84 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -240,6 +240,11 @@ static inline bool is_dyn_asid(u16 asid)
|
||||
return asid < TLB_NR_DYN_ASIDS;
|
||||
}
|
||||
|
||||
+static inline bool is_global_asid(u16 asid)
|
||||
+{
|
||||
+ return !is_dyn_asid(asid);
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm)
|
||||
{
|
||||
@@ -266,9 +271,18 @@ static inline void mm_assign_global_asid
|
||||
mm->context.asid_transition = true;
|
||||
smp_store_release(&mm->context.global_asid, asid);
|
||||
}
|
||||
+
|
||||
+static inline bool mm_in_asid_transition(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return false;
|
||||
+
|
||||
+ return mm && READ_ONCE(mm->context.asid_transition);
|
||||
+}
|
||||
#else
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -227,6 +227,20 @@ static void choose_new_asid(struct mm_st
|
||||
return;
|
||||
}
|
||||
|
||||
+ /*
|
||||
+ * TLB consistency for global ASIDs is maintained with hardware assisted
|
||||
+ * remote TLB flushing. Global ASIDs are always up to date.
|
||||
+ */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ u16 global_asid = mm_global_asid(next);
|
||||
+
|
||||
+ if (global_asid) {
|
||||
+ *new_asid = global_asid;
|
||||
+ *need_flush = false;
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (this_cpu_read(cpu_tlbstate.invalidate_other))
|
||||
clear_asid_other();
|
||||
|
||||
@@ -400,6 +414,23 @@ void mm_free_global_asid(struct mm_struc
|
||||
}
|
||||
|
||||
/*
|
||||
+ * Is the mm transitioning from a CPU-local ASID to a global ASID?
|
||||
+ */
|
||||
+static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
|
||||
+{
|
||||
+ u16 global_asid = mm_global_asid(mm);
|
||||
+
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return false;
|
||||
+
|
||||
+ /* Process is transitioning to a global ASID */
|
||||
+ if (global_asid && asid != global_asid)
|
||||
+ return true;
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
||||
@@ -704,7 +735,8 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
*/
|
||||
if (prev == next) {
|
||||
/* Not actually switching mm's */
|
||||
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
+ VM_WARN_ON(is_dyn_asid(prev_asid) &&
|
||||
+ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
next->context.ctx_id);
|
||||
|
||||
/*
|
||||
@@ -721,6 +753,20 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
+ /* Check if the current mm is transitioning to a global ASID */
|
||||
+ if (mm_needs_global_asid(next, prev_asid)) {
|
||||
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
+ goto reload_tlb;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Broadcast TLB invalidation keeps this ASID up to date
|
||||
+ * all the time.
|
||||
+ */
|
||||
+ if (is_global_asid(prev_asid))
|
||||
+ return;
|
||||
+
|
||||
/*
|
||||
* If the CPU is not in lazy TLB mode, we are just switching
|
||||
* from one thread in a process to another thread in the same
|
||||
@@ -755,6 +801,13 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
cond_mitigation(tsk);
|
||||
|
||||
/*
|
||||
+ * Let nmi_uaccess_okay() and finish_asid_transition()
|
||||
+ * know that CR3 is changing.
|
||||
+ */
|
||||
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
+ barrier();
|
||||
+
|
||||
+ /*
|
||||
* Leave this CPU in prev's mm_cpumask. Atomic writes to
|
||||
* mm_cpumask can be expensive under contention. The CPU
|
||||
* will be removed lazily at TLB flush time.
|
||||
@@ -768,18 +821,12 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
|
||||
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
-
|
||||
- /*
|
||||
- * Indicate that CR3 is about to change. nmi_uaccess_okay()
|
||||
- * and others are sensitive to the window where mm_cpumask(),
|
||||
- * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
|
||||
- */
|
||||
- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
- barrier();
|
||||
}
|
||||
|
||||
+reload_tlb:
|
||||
new_lam = mm_lam_cr3_mask(next);
|
||||
if (need_flush) {
|
||||
+ VM_WARN_ON_ONCE(is_global_asid(new_asid));
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
|
||||
@@ -898,7 +945,7 @@ static void flush_tlb_func(void *info)
|
||||
const struct flush_tlb_info *f = info;
|
||||
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+ u64 local_tlb_gen;
|
||||
bool local = smp_processor_id() == f->initiating_cpu;
|
||||
unsigned long nr_invalidate = 0;
|
||||
u64 mm_tlb_gen;
|
||||
@@ -921,6 +968,16 @@ static void flush_tlb_func(void *info)
|
||||
if (unlikely(loaded_mm == &init_mm))
|
||||
return;
|
||||
|
||||
+ /* Reload the ASID if transitioning into or out of a global ASID */
|
||||
+ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) {
|
||||
+ switch_mm_irqs_off(NULL, loaded_mm, NULL);
|
||||
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
+ }
|
||||
+
|
||||
+ /* Broadcast ASIDs are always kept up to date with INVLPGB. */
|
||||
+ if (is_global_asid(loaded_mm_asid))
|
||||
+ return;
|
||||
+
|
||||
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
|
||||
loaded_mm->context.ctx_id);
|
||||
|
||||
@@ -938,6 +995,8 @@ static void flush_tlb_func(void *info)
|
||||
return;
|
||||
}
|
||||
|
||||
+ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
|
||||
+
|
||||
if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
|
||||
f->new_tlb_gen <= local_tlb_gen)) {
|
||||
/*
|
||||
@@ -1120,7 +1179,7 @@ STATIC_NOPV void native_flush_tlb_multi(
|
||||
* up on the new contents of what used to be page tables, while
|
||||
* doing a speculative memory access.
|
||||
*/
|
||||
- if (info->freed_tables)
|
||||
+ if (info->freed_tables || mm_in_asid_transition(info->mm))
|
||||
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
|
||||
else
|
||||
on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
|
@@ -1,88 +0,0 @@
|
||||
From 6d3b8545e2c3c638363fb449a99b5a6cbab87a49 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:44 -0500
|
||||
Subject: x86/mm: Add global ASID process exit helpers
|
||||
|
||||
A global ASID is allocated for the lifetime of a process. Free the global ASID
|
||||
at process exit time.
|
||||
|
||||
[ bp: Massage, create helpers, hide details inside them. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-10-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/mmu_context.h | 8 +++++++-
|
||||
arch/x86/include/asm/tlbflush.h | 9 +++++++++
|
||||
2 files changed, 16 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/mmu_context.h
|
||||
+++ b/arch/x86/include/asm/mmu_context.h
|
||||
@@ -2,7 +2,6 @@
|
||||
#ifndef _ASM_X86_MMU_CONTEXT_H
|
||||
#define _ASM_X86_MMU_CONTEXT_H
|
||||
|
||||
-#include <asm/desc.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/pkeys.h>
|
||||
@@ -13,6 +12,7 @@
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/debugreg.h>
|
||||
#include <asm/gsseg.h>
|
||||
+#include <asm/desc.h>
|
||||
|
||||
extern atomic64_t last_mm_ctx_id;
|
||||
|
||||
@@ -139,6 +139,9 @@ static inline void mm_reset_untag_mask(s
|
||||
#define enter_lazy_tlb enter_lazy_tlb
|
||||
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
||||
|
||||
+#define mm_init_global_asid mm_init_global_asid
|
||||
+extern void mm_init_global_asid(struct mm_struct *mm);
|
||||
+
|
||||
extern void mm_free_global_asid(struct mm_struct *mm);
|
||||
|
||||
/*
|
||||
@@ -163,6 +166,8 @@ static inline int init_new_context(struc
|
||||
mm->context.execute_only_pkey = -1;
|
||||
}
|
||||
#endif
|
||||
+
|
||||
+ mm_init_global_asid(mm);
|
||||
mm_reset_untag_mask(mm);
|
||||
init_new_context_ldt(mm);
|
||||
return 0;
|
||||
@@ -172,6 +177,7 @@ static inline int init_new_context(struc
|
||||
static inline void destroy_context(struct mm_struct *mm)
|
||||
{
|
||||
destroy_context_ldt(mm);
|
||||
+ mm_free_global_asid(mm);
|
||||
}
|
||||
|
||||
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -261,6 +261,14 @@ static inline u16 mm_global_asid(struct
|
||||
return asid;
|
||||
}
|
||||
|
||||
+static inline void mm_init_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ mm->context.global_asid = 0;
|
||||
+ mm->context.asid_transition = false;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
|
||||
{
|
||||
/*
|
||||
@@ -281,6 +289,7 @@ static inline bool mm_in_asid_transition
|
||||
}
|
||||
#else
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
+static inline void mm_init_global_asid(struct mm_struct *mm) { }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
@@ -1,219 +0,0 @@
|
||||
From 077e9ceb65f514ea63afc65cce86ce8677e77012 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:45 -0500
|
||||
Subject: x86/mm: Enable broadcast TLB invalidation for multi-threaded
|
||||
processes
|
||||
|
||||
There is not enough room in the 12-bit ASID address space to hand out
|
||||
broadcast ASIDs to every process. Only hand out broadcast ASIDs to processes
|
||||
when they are observed to be simultaneously running on 4 or more CPUs.
|
||||
|
||||
This also allows single-threaded processes to continue using the cheaper, local
|
||||
TLB invalidation instructions like INVLPG.
|
||||
|
||||
Due to the structure of flush_tlb_mm_range(), the INVLPGB flushing is done in
|
||||
a generically named broadcast_tlb_flush() function which can later also be
|
||||
used for Intel RAR.
|
||||
|
||||
Combined with the removal of unnecessary lru_add_drain() calls (see
|
||||
https://lore.kernel.org/r/20241219153253.3da9e8aa@fangorn) this results in
|
||||
a nice performance boost for the will-it-scale tlb_flush2_threads test on an
|
||||
AMD Milan system with 36 cores:
|
||||
|
||||
- vanilla kernel: 527k loops/second
|
||||
- lru_add_drain removal: 731k loops/second
|
||||
- only INVLPGB: 527k loops/second
|
||||
- lru_add_drain + INVLPGB: 1157k loops/second
|
||||
|
||||
Profiling with only the INVLPGB changes showed that, while TLB invalidation went
|
||||
down from 40% of the total CPU time to only around 4% of CPU time, the
|
||||
contention simply moved to the LRU lock.
|
||||
|
||||
Fixing both at the same time about doubles the number of iterations per second
|
||||
for this case.
|
||||
|
||||
Comparing will-it-scale tlb_flush2_threads with several different numbers of
|
||||
threads on a 72 CPU AMD Milan shows similar results. The number represents the
|
||||
total number of loops per second across all the threads:
|
||||
|
||||
threads tip INVLPGB
|
||||
|
||||
1 315k 304k
|
||||
2 423k 424k
|
||||
4 644k 1032k
|
||||
8 652k 1267k
|
||||
16 737k 1368k
|
||||
32 759k 1199k
|
||||
64 636k 1094k
|
||||
72 609k 993k
|
||||
|
||||
1 and 2 thread performance is similar with and without INVLPGB, because
|
||||
INVLPGB is only used on processes using 4 or more CPUs simultaneously.
|
||||
|
||||
The number is the median across 5 runs.
|
||||
|
||||
Some numbers closer to real world performance can be found at Phoronix, thanks
|
||||
to Michael:
|
||||
|
||||
https://www.phoronix.com/news/AMD-INVLPGB-Linux-Benefits
|
||||
|
||||
[ bp:
|
||||
- Massage
|
||||
- :%s/\<static_cpu_has\>/cpu_feature_enabled/cgi
|
||||
- :%s/\<clear_asid_transition\>/mm_clear_asid_transition/cgi
|
||||
- Fold in a 0day bot fix: https://lore.kernel.org/oe-kbuild-all/202503040000.GtiWUsBm-lkp@intel.com
|
||||
]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-11-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/tlbflush.h | 6 ++
|
||||
arch/x86/mm/tlb.c | 104 +++++++++++++++++++++++++++++++-
|
||||
2 files changed, 109 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -280,6 +280,11 @@ static inline void mm_assign_global_asid
|
||||
smp_store_release(&mm->context.global_asid, asid);
|
||||
}
|
||||
|
||||
+static inline void mm_clear_asid_transition(struct mm_struct *mm)
|
||||
+{
|
||||
+ WRITE_ONCE(mm->context.asid_transition, false);
|
||||
+}
|
||||
+
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm)
|
||||
{
|
||||
if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
@@ -291,6 +296,7 @@ static inline bool mm_in_asid_transition
|
||||
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
|
||||
static inline void mm_init_global_asid(struct mm_struct *mm) { }
|
||||
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
|
||||
+static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
|
||||
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
|
||||
#endif /* CONFIG_BROADCAST_TLB_FLUSH */
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -431,6 +431,105 @@ static bool mm_needs_global_asid(struct
|
||||
}
|
||||
|
||||
/*
|
||||
+ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
|
||||
+ * systems have over 8k CPUs. Because of this potential ASID shortage,
|
||||
+ * global ASIDs are handed out to processes that have frequent TLB
|
||||
+ * flushes and are active on 4 or more CPUs simultaneously.
|
||||
+ */
|
||||
+static void consider_global_asid(struct mm_struct *mm)
|
||||
+{
|
||||
+ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
|
||||
+ return;
|
||||
+
|
||||
+ /* Check every once in a while. */
|
||||
+ if ((current->pid & 0x1f) != (jiffies & 0x1f))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Assign a global ASID if the process is active on
|
||||
+ * 4 or more CPUs simultaneously.
|
||||
+ */
|
||||
+ if (mm_active_cpus_exceeds(mm, 3))
|
||||
+ use_global_asid(mm);
|
||||
+}
|
||||
+
|
||||
+static void finish_asid_transition(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ struct mm_struct *mm = info->mm;
|
||||
+ int bc_asid = mm_global_asid(mm);
|
||||
+ int cpu;
|
||||
+
|
||||
+ if (!mm_in_asid_transition(mm))
|
||||
+ return;
|
||||
+
|
||||
+ for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
+ /*
|
||||
+ * The remote CPU is context switching. Wait for that to
|
||||
+ * finish, to catch the unlikely case of it switching to
|
||||
+ * the target mm with an out of date ASID.
|
||||
+ */
|
||||
+ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
|
||||
+ cpu_relax();
|
||||
+
|
||||
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
|
||||
+ continue;
|
||||
+
|
||||
+ /*
|
||||
+ * If at least one CPU is not using the global ASID yet,
|
||||
+ * send a TLB flush IPI. The IPI should cause stragglers
|
||||
+ * to transition soon.
|
||||
+ *
|
||||
+ * This can race with the CPU switching to another task;
|
||||
+ * that results in a (harmless) extra IPI.
|
||||
+ */
|
||||
+ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
|
||||
+ flush_tlb_multi(mm_cpumask(info->mm), info);
|
||||
+ return;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ /* All the CPUs running this process are using the global ASID. */
|
||||
+ mm_clear_asid_transition(mm);
|
||||
+}
|
||||
+
|
||||
+static void broadcast_tlb_flush(struct flush_tlb_info *info)
|
||||
+{
|
||||
+ bool pmd = info->stride_shift == PMD_SHIFT;
|
||||
+ unsigned long asid = mm_global_asid(info->mm);
|
||||
+ unsigned long addr = info->start;
|
||||
+
|
||||
+ /*
|
||||
+ * TLB flushes with INVLPGB are kicked off asynchronously.
|
||||
+ * The inc_mm_tlb_gen() guarantees page table updates are done
|
||||
+ * before these TLB flushes happen.
|
||||
+ */
|
||||
+ if (info->end == TLB_FLUSH_ALL) {
|
||||
+ invlpgb_flush_single_pcid_nosync(kern_pcid(asid));
|
||||
+ /* Do any CPUs supporting INVLPGB need PTI? */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_single_pcid_nosync(user_pcid(asid));
|
||||
+ } else do {
|
||||
+ unsigned long nr = 1;
|
||||
+
|
||||
+ if (info->stride_shift <= PMD_SHIFT) {
|
||||
+ nr = (info->end - addr) >> info->stride_shift;
|
||||
+ nr = clamp_val(nr, 1, invlpgb_count_max);
|
||||
+ }
|
||||
+
|
||||
+ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd);
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
|
||||
+ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd);
|
||||
+
|
||||
+ addr += nr << info->stride_shift;
|
||||
+ } while (addr < info->end);
|
||||
+
|
||||
+ finish_asid_transition(info);
|
||||
+
|
||||
+ /* Wait for the INVLPGBs kicked off above to finish. */
|
||||
+ __tlbsync();
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
* Given an ASID, flush the corresponding user ASID. We can delay this
|
||||
* until the next time we switch to it.
|
||||
*
|
||||
@@ -1275,9 +1374,12 @@ void flush_tlb_mm_range(struct mm_struct
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
+ if (mm_global_asid(mm)) {
|
||||
+ broadcast_tlb_flush(info);
|
||||
+ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
|
||||
info->trim_cpumask = should_trim_cpumask(mm);
|
||||
flush_tlb_multi(mm_cpumask(mm), info);
|
||||
+ consider_global_asid(mm);
|
||||
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
||||
lockdep_assert_irqs_enabled();
|
||||
local_irq_disable();
|
@@ -1,83 +0,0 @@
|
||||
From 1994cff363a37aff5b1232ca9f757b02ae244956 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Tue, 25 Feb 2025 22:00:47 -0500
|
||||
Subject: x86/mm: Enable AMD translation cache extensions
|
||||
|
||||
With AMD TCE (translation cache extensions) only the intermediate mappings
|
||||
that cover the address range zapped by INVLPG / INVLPGB get invalidated,
|
||||
rather than all intermediate mappings getting zapped at every TLB invalidation.
|
||||
|
||||
This can help reduce the TLB miss rate, by keeping more intermediate mappings
|
||||
in the cache.
|
||||
|
||||
From the AMD manual:
|
||||
|
||||
Translation Cache Extension (TCE) Bit. Bit 15, read/write. Setting this bit to
|
||||
1 changes how the INVLPG, INVLPGB, and INVPCID instructions operate on TLB
|
||||
entries. When this bit is 0, these instructions remove the target PTE from the
|
||||
TLB as well as all upper-level table entries that are cached in the TLB,
|
||||
whether or not they are associated with the target PTE. When this bit is set,
|
||||
these instructions will remove the target PTE and only those upper-level
|
||||
entries that lead to the target PTE in the page table hierarchy, leaving
|
||||
unrelated upper-level entries intact.
|
||||
|
||||
[ bp: use cpu_has()... I know, it is a mess. ]
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250226030129.530345-13-riel@surriel.com
|
||||
---
|
||||
arch/x86/include/asm/msr-index.h | 2 ++
|
||||
arch/x86/kernel/cpu/amd.c | 4 ++++
|
||||
tools/arch/x86/include/asm/msr-index.h | 2 ++
|
||||
3 files changed, 8 insertions(+)
|
||||
|
||||
--- a/arch/x86/include/asm/msr-index.h
|
||||
+++ b/arch/x86/include/asm/msr-index.h
|
||||
@@ -25,6 +25,7 @@
|
||||
#define _EFER_SVME 12 /* Enable virtualization */
|
||||
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
|
||||
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
|
||||
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
|
||||
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
|
||||
|
||||
#define EFER_SCE (1<<_EFER_SCE)
|
||||
@@ -34,6 +35,7 @@
|
||||
#define EFER_SVME (1<<_EFER_SVME)
|
||||
#define EFER_LMSLE (1<<_EFER_LMSLE)
|
||||
#define EFER_FFXSR (1<<_EFER_FFXSR)
|
||||
+#define EFER_TCE (1<<_EFER_TCE)
|
||||
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
|
||||
|
||||
/*
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -1081,6 +1081,10 @@ static void init_amd(struct cpuinfo_x86
|
||||
|
||||
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
|
||||
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
|
||||
+
|
||||
+ /* Enable Translation Cache Extension */
|
||||
+ if (cpu_has(c, X86_FEATURE_TCE))
|
||||
+ msr_set_bit(MSR_EFER, _EFER_TCE);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
--- a/tools/arch/x86/include/asm/msr-index.h
|
||||
+++ b/tools/arch/x86/include/asm/msr-index.h
|
||||
@@ -25,6 +25,7 @@
|
||||
#define _EFER_SVME 12 /* Enable virtualization */
|
||||
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
|
||||
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
|
||||
+#define _EFER_TCE 15 /* Enable Translation Cache Extensions */
|
||||
#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
|
||||
|
||||
#define EFER_SCE (1<<_EFER_SCE)
|
||||
@@ -34,6 +35,7 @@
|
||||
#define EFER_SVME (1<<_EFER_SVME)
|
||||
#define EFER_LMSLE (1<<_EFER_LMSLE)
|
||||
#define EFER_FFXSR (1<<_EFER_FFXSR)
|
||||
+#define EFER_TCE (1<<_EFER_TCE)
|
||||
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
|
||||
|
||||
/*
|
@@ -1,121 +0,0 @@
|
||||
From 5932a2c8122050c4a2f71588778feb0677fe32b4 Mon Sep 17 00:00:00 2001
|
||||
From: Tom Lendacky <thomas.lendacky@amd.com>
|
||||
Date: Tue, 4 Mar 2025 12:59:56 +0100
|
||||
Subject: x86/mm: Always set the ASID valid bit for the INVLPGB instruction
|
||||
|
||||
When executing the INVLPGB instruction on a bare-metal host or hypervisor, if
|
||||
the ASID valid bit is not set, the instruction will flush the TLB entries that
|
||||
match the specified criteria for any ASID, not just those of the host. If
|
||||
virtual machines are running on the system, this may result in inadvertent
|
||||
flushes of guest TLB entries.
|
||||
|
||||
When executing the INVLPGB instruction in a guest and the INVLPGB instruction is
|
||||
not intercepted by the hypervisor, the hardware will replace the requested ASID
|
||||
with the guest ASID and set the ASID valid bit before doing the broadcast
|
||||
invalidation. Thus a guest is only able to flush its own TLB entries.
|
||||
|
||||
So to limit the host TLB flushing reach, always set the ASID valid bit using an
|
||||
ASID value of 0 (which represents the host/hypervisor). This will result in
|
||||
the desired effect in both host and guest.
|
||||
|
||||
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
|
||||
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Link: https://lore.kernel.org/r/20250304120449.GHZ8bsYYyEBOKQIxBm@fat_crate.local
|
||||
---
|
||||
arch/x86/include/asm/tlb.h | 58 +++++++++++++++++++++-----------------
|
||||
1 file changed, 32 insertions(+), 26 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlb.h
|
||||
+++ b/arch/x86/include/asm/tlb.h
|
||||
@@ -33,6 +33,27 @@ enum addr_stride {
|
||||
PMD_STRIDE = 1
|
||||
};
|
||||
|
||||
+/*
|
||||
+ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
+ * of the three. For example:
|
||||
+ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
+ * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
+ *
|
||||
+ * The first is used to invalidate (kernel) mappings at a particular
|
||||
+ * address across all processes.
|
||||
+ *
|
||||
+ * The latter invalidates all TLB entries matching a PCID.
|
||||
+ */
|
||||
+#define INVLPGB_FLAG_VA BIT(0)
|
||||
+#define INVLPGB_FLAG_PCID BIT(1)
|
||||
+#define INVLPGB_FLAG_ASID BIT(2)
|
||||
+#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
+#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
+#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
+
|
||||
+/* The implied mode when all bits are clear: */
|
||||
+#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
+
|
||||
#ifdef CONFIG_BROADCAST_TLB_FLUSH
|
||||
/*
|
||||
* INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
|
||||
@@ -40,14 +61,20 @@ enum addr_stride {
|
||||
* The INVLPGB instruction is weakly ordered, and a batch of invalidations can
|
||||
* be done in a parallel fashion.
|
||||
*
|
||||
- * The instruction takes the number of extra pages to invalidate, beyond
|
||||
- * the first page, while __invlpgb gets the more human readable number of
|
||||
- * pages to invalidate.
|
||||
+ * The instruction takes the number of extra pages to invalidate, beyond the
|
||||
+ * first page, while __invlpgb gets the more human readable number of pages to
|
||||
+ * invalidate.
|
||||
*
|
||||
* The bits in rax[0:2] determine respectively which components of the address
|
||||
* (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any*
|
||||
* address in the specified range matches.
|
||||
*
|
||||
+ * Since it is desired to only flush TLB entries for the ASID that is executing
|
||||
+ * the instruction (a host/hypervisor or a guest), the ASID valid bit should
|
||||
+ * always be set. On a host/hypervisor, the hardware will use the ASID value
|
||||
+ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will
|
||||
+ * use the actual ASID value of the guest.
|
||||
+ *
|
||||
* TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
|
||||
* this CPU have completed.
|
||||
*/
|
||||
@@ -55,9 +82,9 @@ static inline void __invlpgb(unsigned lo
|
||||
unsigned long addr, u16 nr_pages,
|
||||
enum addr_stride stride, u8 flags)
|
||||
{
|
||||
- u32 edx = (pcid << 16) | asid;
|
||||
+ u64 rax = addr | flags | INVLPGB_FLAG_ASID;
|
||||
u32 ecx = (stride << 31) | (nr_pages - 1);
|
||||
- u64 rax = addr | flags;
|
||||
+ u32 edx = (pcid << 16) | asid;
|
||||
|
||||
/* The low bits in rax are for flags. Verify addr is clean. */
|
||||
VM_WARN_ON_ONCE(addr & ~PAGE_MASK);
|
||||
@@ -93,27 +120,6 @@ static inline void __invlpgb_all(unsigne
|
||||
static inline void __tlbsync(void) { }
|
||||
#endif
|
||||
|
||||
-/*
|
||||
- * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
|
||||
- * of the three. For example:
|
||||
- * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address
|
||||
- * - FLAG_PCID: invalidate all TLB entries matching the PCID
|
||||
- *
|
||||
- * The first is used to invalidate (kernel) mappings at a particular
|
||||
- * address across all processes.
|
||||
- *
|
||||
- * The latter invalidates all TLB entries matching a PCID.
|
||||
- */
|
||||
-#define INVLPGB_FLAG_VA BIT(0)
|
||||
-#define INVLPGB_FLAG_PCID BIT(1)
|
||||
-#define INVLPGB_FLAG_ASID BIT(2)
|
||||
-#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3)
|
||||
-#define INVLPGB_FLAG_FINAL_ONLY BIT(4)
|
||||
-#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5)
|
||||
-
|
||||
-/* The implied mode when all bits are clear: */
|
||||
-#define INVLPGB_MODE_ALL_NONGLOBALS 0UL
|
||||
-
|
||||
static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
|
||||
unsigned long addr,
|
||||
u16 nr, bool stride)
|
@@ -1,70 +0,0 @@
|
||||
From 0e0a5ca37a8e3b06f450f4093ba1b6d6f33c2161 Mon Sep 17 00:00:00 2001
|
||||
From: Rik van Riel <riel@surriel.com>
|
||||
Date: Wed, 19 Mar 2025 13:25:20 -0400
|
||||
Subject: x86/mm: Only do broadcast flush from reclaim if pages were unmapped
|
||||
|
||||
Track whether pages were unmapped from any MM (even ones with a currently
|
||||
empty mm_cpumask) by the reclaim code, to figure out whether or not
|
||||
a broadcast TLB flush should be done when reclaim finishes.
|
||||
|
||||
The reason any MM must be tracked, and not only ones contributing to the
|
||||
tlbbatch cpumask, is that broadcast ASIDs are expected to be kept up to
|
||||
date even on CPUs where the MM is not currently active.
|
||||
|
||||
This change allows reclaim to avoid doing TLB flushes when only clean page
|
||||
cache pages and/or slab memory were reclaimed, which is fairly common.
|
||||
|
||||
( This is a simpler alternative to the code that was in my INVLPGB series
|
||||
before, and it seems to capture most of the benefit due to how common
|
||||
it is to reclaim only page cache. )
|
||||
|
||||
Signed-off-by: Rik van Riel <riel@surriel.com>
|
||||
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Andy Lutomirski <luto@kernel.org>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Link: https://lore.kernel.org/r/20250319132520.6b10ad90@fangorn
|
||||
---
|
||||
arch/x86/include/asm/tlbbatch.h | 5 +++++
|
||||
arch/x86/include/asm/tlbflush.h | 1 +
|
||||
arch/x86/mm/tlb.c | 3 ++-
|
||||
3 files changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/x86/include/asm/tlbbatch.h
|
||||
+++ b/arch/x86/include/asm/tlbbatch.h
|
||||
@@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch {
|
||||
* the PFNs being flushed..
|
||||
*/
|
||||
struct cpumask cpumask;
|
||||
+ /*
|
||||
+ * Set if pages were unmapped from any MM, even one that does not
|
||||
+ * have active CPUs in its cpumask.
|
||||
+ */
|
||||
+ bool unmapped_pages;
|
||||
};
|
||||
|
||||
#endif /* _ARCH_X86_TLBBATCH_H */
|
||||
--- a/arch/x86/include/asm/tlbflush.h
|
||||
+++ b/arch/x86/include/asm/tlbflush.h
|
||||
@@ -353,6 +353,7 @@ static inline void arch_tlbbatch_add_pen
|
||||
{
|
||||
inc_mm_tlb_gen(mm);
|
||||
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
|
||||
+ batch->unmapped_pages = true;
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
|
||||
}
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -1648,8 +1648,9 @@ void arch_tlbbatch_flush(struct arch_tlb
|
||||
* a local TLB flush is needed. Optimize this use-case by calling
|
||||
* flush_tlb_func_local() directly in this case.
|
||||
*/
|
||||
- if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
|
||||
invlpgb_flush_all_nonglobals();
|
||||
+ batch->unmapped_pages = false;
|
||||
} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
|
||||
flush_tlb_multi(&batch->cpumask, info);
|
||||
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
@@ -1,92 +0,0 @@
|
||||
From 6ae491224973eb4013ee67a8c05c420f057d5fee Mon Sep 17 00:00:00 2001
|
||||
From: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Date: Thu, 8 May 2025 15:41:32 -0700
|
||||
Subject: x86/mm: Eliminate window where TLB flushes may be inadvertently
|
||||
skipped
|
||||
|
||||
tl;dr: There is a window in the mm switching code where the new CR3 is
|
||||
set and the CPU should be getting TLB flushes for the new mm. But
|
||||
should_flush_tlb() has a bug and suppresses the flush. Fix it by
|
||||
widening the window where should_flush_tlb() sends an IPI.
|
||||
|
||||
Long Version:
|
||||
|
||||
=== History ===
|
||||
|
||||
There were a few things leading up to this.
|
||||
|
||||
First, updating mm_cpumask() was observed to be too expensive, so it was
|
||||
made lazier. But being lazy caused too many unnecessary IPIs to CPUs
|
||||
due to the now-lazy mm_cpumask(). So code was added to cull
|
||||
mm_cpumask() periodically[2]. But that culling was a bit too aggressive
|
||||
and skipped sending TLB flushes to CPUs that need them. So here we are
|
||||
again.
|
||||
|
||||
=== Problem ===
|
||||
|
||||
The too-aggressive code in should_flush_tlb() strikes in this window:
|
||||
|
||||
// Turn on IPIs for this CPU/mm combination, but only
|
||||
// if should_flush_tlb() agrees:
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
load_new_mm_cr3(need_flush);
|
||||
// ^ After 'need_flush' is set to false, IPIs *MUST*
|
||||
// be sent to this CPU and not be ignored.
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
// ^ Not until this point does should_flush_tlb()
|
||||
// become true!
|
||||
|
||||
should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3()
|
||||
and writing to 'loaded_mm', which is a window where they should not be
|
||||
suppressed. Whoops.
|
||||
|
||||
=== Solution ===
|
||||
|
||||
Thankfully, the fuzzy "just about to write CR3" window is already marked
|
||||
with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in
|
||||
should_flush_tlb() is sufficient to ensure that the CPU is targeted with
|
||||
an IPI.
|
||||
|
||||
This will cause more TLB flush IPIs. But the window is relatively small
|
||||
and I do not expect this to cause any kind of measurable performance
|
||||
impact.
|
||||
|
||||
Update the comment where LOADED_MM_SWITCHING is written since it grew
|
||||
yet another user.
|
||||
|
||||
Peter Z also raised a concern that should_flush_tlb() might not observe
|
||||
'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off()
|
||||
writes them. Add a barrier to ensure that they are observed in the
|
||||
order they are written.
|
||||
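
The hunk included below only shows the restored comment; the actual
should_flush_tlb() change is not visible in this excerpt. A minimal sketch of
the check described above, assuming the existing should_flush_tlb() callback
and the LOADED_MM_SWITCHING sentinel in arch/x86/mm/tlb.c (a simplified
illustration, not the verbatim upstream hunk):

	static bool should_flush_tlb(int cpu, void *data)
	{
		struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
		struct flush_tlb_info *info = data;

		/*
		 * Order these reads against the writes done in
		 * switch_mm_irqs_off(), per the concern described above.
		 */
		smp_rmb();

		/* Lazy-TLB CPUs get flushed at their next context switch. */
		if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
			return false;

		/* A full flush (no specific mm) targets every non-lazy CPU. */
		if (!info->mm)
			return true;

		/*
		 * The CPU is between load_new_mm_cr3() and the loaded_mm
		 * write: CR3 may already point at the new mm, so it must
		 * be targeted with the IPI.
		 */
		if (loaded_mm == LOADED_MM_SWITCHING)
			return true;

		/* Otherwise only CPUs actually running the target mm flush. */
		return loaded_mm == info->mm;
	}
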
|
||||
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Acked-by: Rik van Riel <riel@surriel.com>
|
||||
Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1]
|
||||
Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2]
|
||||
Reported-by: Stephen Dolan <sdolan@janestreet.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Acked-by: Ingo Molnar <mingo@kernel.org>
|
||||
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
||||
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
---
|
||||
arch/x86/mm/tlb.c | 22 +++++++++++++++++++---
|
||||
1 file changed, 19 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/mm/tlb.c
|
||||
+++ b/arch/x86/mm/tlb.c
|
||||
@@ -900,8 +900,9 @@ void switch_mm_irqs_off(struct mm_struct
|
||||
cond_mitigation(tsk);
|
||||
|
||||
/*
|
||||
- * Let nmi_uaccess_okay() and finish_asid_transition()
|
||||
- * know that CR3 is changing.
|
||||
+ * Indicate that CR3 is about to change. nmi_uaccess_okay()
|
||||
+ * and others are sensitive to the window where mm_cpumask(),
|
||||
+ * CR3 and cpu_tlbstate.loaded_mm are not all in sync.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
|
||||
barrier();
|
@@ -1,4 +1,4 @@
|
||||
From e3d18eed972374cfbac1e58cf109209b07c1e27e Mon Sep 17 00:00:00 2001
|
||||
From 3400d11fad849dae6015e448c83d6e90f8a6ef35 Mon Sep 17 00:00:00 2001
|
||||
From: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Date: Tue, 8 Apr 2025 12:02:36 +0200
|
||||
Subject: ice: mark ice_write_prof_mask_reg() as noinline
|
@@ -1,4 +1,4 @@
|
||||
From f762c206076d274ecb0e2f3d9b6cbca361ebb246 Mon Sep 17 00:00:00 2001
|
||||
From 1615cc0c7d979a1c211f349c8c28ee8afb9ad57d Mon Sep 17 00:00:00 2001
|
||||
From: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Date: Thu, 1 May 2025 20:22:53 +0200
|
||||
Subject: wifi: mac80211: mark copy_mesh_setup() as noinline
|
debian/patches/patchset-pf/nfs/0001-NFSD-unregister-filesystem-in-case-genl_register_fam.patch (new vendored file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
From c207229d3f7b851d246f1904bc4cab7ae9ada58b Mon Sep 17 00:00:00 2001
|
||||
From: Maninder Singh <maninder1.s@samsung.com>
|
||||
Date: Thu, 6 Mar 2025 14:50:06 +0530
|
||||
Subject: NFSD: unregister filesystem in case genl_register_family() fails
|
||||
|
||||
With rpc_status netlink support, the matching unregister_filesystem() call
|
||||
was missed when genl_register_family() fails.
|
||||
|
||||
Correct this by adding a new error label.
|
||||
|
||||
Fixes: bd9d6a3efa97 ("NFSD: add rpc_status netlink support")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfsctl.c | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfsd/nfsctl.c
|
||||
+++ b/fs/nfsd/nfsctl.c
|
||||
@@ -2305,7 +2305,7 @@ static int __init init_nfsd(void)
|
||||
goto out_free_cld;
|
||||
retval = register_filesystem(&nfsd_fs_type);
|
||||
if (retval)
|
||||
- goto out_free_all;
|
||||
+ goto out_free_nfsd4;
|
||||
retval = genl_register_family(&nfsd_nl_family);
|
||||
if (retval)
|
||||
goto out_free_all;
|
||||
@@ -2313,6 +2313,8 @@ static int __init init_nfsd(void)
|
||||
|
||||
return 0;
|
||||
out_free_all:
|
||||
+ unregister_filesystem(&nfsd_fs_type);
|
||||
+out_free_nfsd4:
|
||||
nfsd4_destroy_laundry_wq();
|
||||
out_free_cld:
|
||||
unregister_cld_notifier();
|
debian/patches/patchset-pf/nfs/0002-NFSD-fix-race-between-nfsd-registration-and-exports_.patch (new vendored file, 162 lines)
@@ -0,0 +1,162 @@
|
||||
From bda3cf19bcf44807c401b807dee83aadda959287 Mon Sep 17 00:00:00 2001
|
||||
From: Maninder Singh <maninder1.s@samsung.com>
|
||||
Date: Thu, 6 Mar 2025 14:50:07 +0530
|
||||
Subject: NFSD: fix race between nfsd registration and exports_proc
|
||||
|
||||
As of now, nfsd calls create_proc_exports_entry() at the start of init_nfsd()
|
||||
and cleans up with remove_proc_entry() at the end of exit_nfsd().
|
||||
|
||||
This causes a kernel oops if there is a race between the two operations below:
|
||||
(i) exportfs -r
|
||||
(ii) mount -t nfsd none /proc/fs/nfsd
|
||||
|
||||
for 5.4 kernel ARM64:
|
||||
|
||||
CPU 1:
|
||||
el1_irq+0xbc/0x180
|
||||
arch_counter_get_cntvct+0x14/0x18
|
||||
running_clock+0xc/0x18
|
||||
preempt_count_add+0x88/0x110
|
||||
prep_new_page+0xb0/0x220
|
||||
get_page_from_freelist+0x2d8/0x1778
|
||||
__alloc_pages_nodemask+0x15c/0xef0
|
||||
__vmalloc_node_range+0x28c/0x478
|
||||
__vmalloc_node_flags_caller+0x8c/0xb0
|
||||
kvmalloc_node+0x88/0xe0
|
||||
nfsd_init_net+0x6c/0x108 [nfsd]
|
||||
ops_init+0x44/0x170
|
||||
register_pernet_operations+0x114/0x270
|
||||
register_pernet_subsys+0x34/0x50
|
||||
init_nfsd+0xa8/0x718 [nfsd]
|
||||
do_one_initcall+0x54/0x2e0
|
||||
|
||||
CPU 2 :
|
||||
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
|
||||
|
||||
PC is at : exports_net_open+0x50/0x68 [nfsd]
|
||||
|
||||
Call trace:
|
||||
exports_net_open+0x50/0x68 [nfsd]
|
||||
exports_proc_open+0x2c/0x38 [nfsd]
|
||||
proc_reg_open+0xb8/0x198
|
||||
do_dentry_open+0x1c4/0x418
|
||||
vfs_open+0x38/0x48
|
||||
path_openat+0x28c/0xf18
|
||||
do_filp_open+0x70/0xe8
|
||||
do_sys_open+0x154/0x248
|
||||
|
||||
Sometimes it crashes at exports_net_open() and sometimes cache_seq_next_rcu().
|
||||
|
||||
and the same happens on the latest 6.14 kernel as well:
|
||||
|
||||
[ 0.000000] Linux version 6.14.0-rc5-next-20250304-dirty
|
||||
...
|
||||
[ 285.455918] Unable to handle kernel paging request at virtual address 00001f4800001f48
|
||||
...
|
||||
[ 285.464902] pc : cache_seq_next_rcu+0x78/0xa4
|
||||
...
|
||||
[ 285.469695] Call trace:
|
||||
[ 285.470083] cache_seq_next_rcu+0x78/0xa4 (P)
|
||||
[ 285.470488] seq_read+0xe0/0x11c
|
||||
[ 285.470675] proc_reg_read+0x9c/0xf0
|
||||
[ 285.470874] vfs_read+0xc4/0x2fc
|
||||
[ 285.471057] ksys_read+0x6c/0xf4
|
||||
[ 285.471231] __arm64_sys_read+0x1c/0x28
|
||||
[ 285.471428] invoke_syscall+0x44/0x100
|
||||
[ 285.471633] el0_svc_common.constprop.0+0x40/0xe0
|
||||
[ 285.471870] do_el0_svc_compat+0x1c/0x34
|
||||
[ 285.472073] el0_svc_compat+0x2c/0x80
|
||||
[ 285.472265] el0t_32_sync_handler+0x90/0x140
|
||||
[ 285.472473] el0t_32_sync+0x19c/0x1a0
|
||||
[ 285.472887] Code: f9400885 93407c23 937d7c27 11000421 (f86378a3)
|
||||
[ 285.473422] ---[ end trace 0000000000000000 ]---
|
||||
|
||||
It reproduces easily with the script below:
|
||||
while [ 1 ]
|
||||
do
|
||||
/exportfs -r
|
||||
done &
|
||||
|
||||
while [ 1 ]
|
||||
do
|
||||
insmod /nfsd.ko
|
||||
mount -t nfsd none /proc/fs/nfsd
|
||||
umount /proc/fs/nfsd
|
||||
rmmod nfsd
|
||||
done &
|
||||
|
||||
So the interfaces exported to user space should be registered last and
|
||||
cleaned up first.
|
||||
|
||||
With this change there is no kernel oops.
|
||||
|
||||
Co-developed-by: Shubham Rana <s9.rana@samsung.com>
|
||||
Signed-off-by: Shubham Rana <s9.rana@samsung.com>
|
||||
Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfsctl.c | 17 ++++++++---------
|
||||
1 file changed, 8 insertions(+), 9 deletions(-)
|
||||
|
||||
--- a/fs/nfsd/nfsctl.c
|
||||
+++ b/fs/nfsd/nfsctl.c
|
||||
@@ -2291,12 +2291,9 @@ static int __init init_nfsd(void)
|
||||
if (retval)
|
||||
goto out_free_pnfs;
|
||||
nfsd_lockd_init(); /* lockd->nfsd callbacks */
|
||||
- retval = create_proc_exports_entry();
|
||||
- if (retval)
|
||||
- goto out_free_lockd;
|
||||
retval = register_pernet_subsys(&nfsd_net_ops);
|
||||
if (retval < 0)
|
||||
- goto out_free_exports;
|
||||
+ goto out_free_lockd;
|
||||
retval = register_cld_notifier();
|
||||
if (retval)
|
||||
goto out_free_subsys;
|
||||
@@ -2308,11 +2305,16 @@ static int __init init_nfsd(void)
|
||||
goto out_free_nfsd4;
|
||||
retval = genl_register_family(&nfsd_nl_family);
|
||||
if (retval)
|
||||
+ goto out_free_filesystem;
|
||||
+ retval = create_proc_exports_entry();
|
||||
+ if (retval)
|
||||
goto out_free_all;
|
||||
nfsd_localio_ops_init();
|
||||
|
||||
return 0;
|
||||
out_free_all:
|
||||
+ genl_unregister_family(&nfsd_nl_family);
|
||||
+out_free_filesystem:
|
||||
unregister_filesystem(&nfsd_fs_type);
|
||||
out_free_nfsd4:
|
||||
nfsd4_destroy_laundry_wq();
|
||||
@@ -2320,9 +2322,6 @@ out_free_cld:
|
||||
unregister_cld_notifier();
|
||||
out_free_subsys:
|
||||
unregister_pernet_subsys(&nfsd_net_ops);
|
||||
-out_free_exports:
|
||||
- remove_proc_entry("fs/nfs/exports", NULL);
|
||||
- remove_proc_entry("fs/nfs", NULL);
|
||||
out_free_lockd:
|
||||
nfsd_lockd_shutdown();
|
||||
nfsd_drc_slab_free();
|
||||
@@ -2335,14 +2334,14 @@ out_free_slabs:
|
||||
|
||||
static void __exit exit_nfsd(void)
|
||||
{
|
||||
+ remove_proc_entry("fs/nfs/exports", NULL);
|
||||
+ remove_proc_entry("fs/nfs", NULL);
|
||||
genl_unregister_family(&nfsd_nl_family);
|
||||
unregister_filesystem(&nfsd_fs_type);
|
||||
nfsd4_destroy_laundry_wq();
|
||||
unregister_cld_notifier();
|
||||
unregister_pernet_subsys(&nfsd_net_ops);
|
||||
nfsd_drc_slab_free();
|
||||
- remove_proc_entry("fs/nfs/exports", NULL);
|
||||
- remove_proc_entry("fs/nfs", NULL);
|
||||
nfsd_lockd_shutdown();
|
||||
nfsd4_free_slabs();
|
||||
nfsd4_exit_pnfs();
|
debian/patches/patchset-pf/nfs/0003-nfsd-fix-access-checking-for-NLM-under-XPRTSEC-polic.patch (new vendored file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
From b9293b51ea6182618e474edfbeb5cd34f5e875e8 Mon Sep 17 00:00:00 2001
|
||||
From: Olga Kornievskaia <okorniev@redhat.com>
|
||||
Date: Fri, 21 Mar 2025 20:13:04 -0400
|
||||
Subject: nfsd: fix access checking for NLM under XPRTSEC policies
|
||||
|
||||
When an export's xprtsec policy is set to "tls"
|
||||
and/or "mtls", but an NFS client is doing a v3 xprtsec=tls
|
||||
mount, NLM locking calls fail with an error because
|
||||
there is currently no support for NLM with TLS.
|
||||
|
||||
Until such support is added, allow NLM calls under a TLS-secured
|
||||
policy.
|
||||
|
||||
Fixes: 4cc9b9f2bf4d ("nfsd: refine and rename NFSD_MAY_LOCK")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Olga Kornievskaia <okorniev@redhat.com>
|
||||
Reviewed-by: NeilBrown <neil@brown.name>
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/export.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfsd/export.c
|
||||
+++ b/fs/nfsd/export.c
|
||||
@@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_expo
|
||||
test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
|
||||
goto ok;
|
||||
}
|
||||
- goto denied;
|
||||
+ if (!may_bypass_gss)
|
||||
+ goto denied;
|
||||
|
||||
ok:
|
||||
/* legacy gss-only clients are always OK: */
|
debian/patches/patchset-pf/nfs/0004-nfsd-nfsd4_spo_must_allow-must-check-this-is-a-v4-co.patch (new vendored file, 32 lines)
@@ -0,0 +1,32 @@
|
||||
From 778e820deed49a0dee6115c0aa903e626ab635f6 Mon Sep 17 00:00:00 2001
|
||||
From: NeilBrown <neil@brown.name>
|
||||
Date: Fri, 28 Mar 2025 11:05:59 +1100
|
||||
Subject: nfsd: nfsd4_spo_must_allow() must check this is a v4 compound request
|
||||
|
||||
If the request being processed is not a v4 compound request, then
|
||||
examining the cstate can have undefined results.
|
||||
|
||||
This patch adds a check that the rpc procedure being executed
|
||||
(rq_procinfo) is the NFSPROC4_COMPOUND procedure.
|
||||
|
||||
Reported-by: Olga Kornievskaia <okorniev@redhat.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: NeilBrown <neil@brown.name>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfs4proc.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfsd/nfs4proc.c
|
||||
+++ b/fs/nfsd/nfs4proc.c
|
||||
@@ -3766,7 +3766,8 @@ bool nfsd4_spo_must_allow(struct svc_rqs
|
||||
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
|
||||
u32 opiter;
|
||||
|
||||
- if (!cstate->minorversion)
|
||||
+ if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] ||
|
||||
+ cstate->minorversion == 0)
|
||||
return false;
|
||||
|
||||
if (cstate->spo_must_allowed)
|
debian/patches/patchset-pf/nfs/0005-nfsd-Initialize-ssc-before-laundromat_work-to-preven.patch (new vendored file, 47 lines)
@@ -0,0 +1,47 @@
|
||||
From 8a7faf80fbb9ecdea403cb4f882354e8a5201acb Mon Sep 17 00:00:00 2001
|
||||
From: Li Lingfeng <lilingfeng3@huawei.com>
|
||||
Date: Mon, 14 Apr 2025 22:38:52 +0800
|
||||
Subject: nfsd: Initialize ssc before laundromat_work to prevent NULL
|
||||
dereference
|
||||
|
||||
In nfs4_state_start_net(), laundromat_work may access nfsd_ssc through
|
||||
nfs4_laundromat -> nfsd4_ssc_expire_umount. If nfsd_ssc isn't initialized,
|
||||
this can cause a NULL pointer dereference.
|
||||
|
||||
Normally the delayed start of laundromat_work allows sufficient time for
|
||||
nfsd_ssc initialization to complete. However, when the kernel waits too
|
||||
long for userspace responses (e.g. in nfs4_state_start_net ->
|
||||
nfsd4_end_grace -> nfsd4_record_grace_done -> nfsd4_cld_grace_done ->
|
||||
cld_pipe_upcall -> __cld_pipe_upcall -> wait_for_completion path), the
|
||||
delayed work may start before nfsd_ssc initialization finishes.
|
||||
|
||||
Fix this by moving nfsd_ssc initialization before starting laundromat_work.
|
||||
|
||||
Fixes: f4e44b393389 ("NFSD: delay unmount source's export after inter-server copy completed.")
|
||||
Cc: stable@vger.kernel.org
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfssvc.c | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/fs/nfsd/nfssvc.c
|
||||
+++ b/fs/nfsd/nfssvc.c
|
||||
@@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *
|
||||
if (ret)
|
||||
goto out_filecache;
|
||||
|
||||
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
|
||||
+ nfsd4_ssc_init_umount_work(nn);
|
||||
+#endif
|
||||
ret = nfs4_state_start_net(net);
|
||||
if (ret)
|
||||
goto out_reply_cache;
|
||||
|
||||
-#ifdef CONFIG_NFSD_V4_2_INTER_SSC
|
||||
- nfsd4_ssc_init_umount_work(nn);
|
||||
-#endif
|
||||
nn->nfsd_net_up = true;
|
||||
return 0;
|
||||
|
debian/patches/patchset-pf/nfs/0006-NFSD-Implement-FATTR4_CLONE_BLKSIZE-attribute.patch (new vendored file, 62 lines)
@@ -0,0 +1,62 @@
|
||||
From 12e39177848d11c6ac5ad70ce530364fac7f36d3 Mon Sep 17 00:00:00 2001
|
||||
From: Chuck Lever <chuck.lever@oracle.com>
|
||||
Date: Wed, 7 May 2025 10:45:15 -0400
|
||||
Subject: NFSD: Implement FATTR4_CLONE_BLKSIZE attribute
|
||||
|
||||
RFC 7862 states that if an NFS server implements a CLONE operation,
|
||||
it MUST also implement FATTR4_CLONE_BLKSIZE. NFSD implements CLONE,
|
||||
but does not implement FATTR4_CLONE_BLKSIZE.
|
||||
|
||||
Note that in Section 12.2, RFC 7862 claims that
|
||||
FATTR4_CLONE_BLKSIZE is RECOMMENDED, not REQUIRED. Likely this is
|
||||
because a minor version is not permitted to add a REQUIRED
|
||||
attribute. Confusing.
|
||||
|
||||
We assume this attribute reports a block size as a count of bytes,
|
||||
as RFC 7862 does not specify a unit.
|
||||
|
||||
Reported-by: Roland Mainz <roland.mainz@nrubsig.org>
|
||||
Suggested-by: Christoph Hellwig <hch@infradead.org>
|
||||
Reviewed-by: Roland Mainz <roland.mainz@nrubsig.org>
|
||||
Cc: stable@vger.kernel.org # v6.7+
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfs4xdr.c | 19 ++++++++++++++++++-
|
||||
1 file changed, 18 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfsd/nfs4xdr.c
|
||||
+++ b/fs/nfsd/nfs4xdr.c
|
||||
@@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppat
|
||||
return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Copied from generic_remap_checks/generic_remap_file_range_prep.
|
||||
+ *
|
||||
+ * These generic functions use the file system's s_blocksize, but
|
||||
+ * individual file systems aren't required to use
|
||||
+ * generic_remap_file_range_prep. Until there is a mechanism for
|
||||
+ * determining a particular file system's (or file's) clone block
|
||||
+ * size, this is the best NFSD can do.
|
||||
+ */
|
||||
+static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr,
|
||||
+ const struct nfsd4_fattr_args *args)
|
||||
+{
|
||||
+ struct inode *inode = d_inode(args->dentry);
|
||||
+
|
||||
+ return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize);
|
||||
+}
|
||||
+
|
||||
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
|
||||
static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
|
||||
const struct nfsd4_fattr_args *args)
|
||||
@@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fa
|
||||
[FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
|
||||
[FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
|
||||
[FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
|
||||
- [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
|
||||
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize,
|
||||
[FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
|
||||
[FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
|
||||
|
debian/patches/patchset-pf/nfs/0007-fs-nfs-read-fix-double-unlock-bug-in-nfs_return_empt.patch (new vendored file, 65 lines)
@@ -0,0 +1,65 @@
|
||||
From 2623f0468759aba585c7ae86adc1cf1cb11e1b63 Mon Sep 17 00:00:00 2001
|
||||
From: Max Kellermann <max.kellermann@ionos.com>
|
||||
Date: Wed, 23 Apr 2025 15:22:50 +0200
|
||||
Subject: fs/nfs/read: fix double-unlock bug in nfs_return_empty_folio()
|
||||
|
||||
Sometimes, when a file was read while it was being truncated by
|
||||
another NFS client, the kernel could deadlock because folio_unlock()
|
||||
was called twice, and the second call would XOR back the `PG_locked`
|
||||
flag.
|
||||
|
||||
Most of the time (depending on the timing of the truncation), nobody
|
||||
notices the problem because folio_unlock() gets called three times,
|
||||
which flips `PG_locked` back off:
|
||||
|
||||
1. vfs_read, nfs_read_folio, ... nfs_read_add_folio,
|
||||
nfs_return_empty_folio
|
||||
2. vfs_read, nfs_read_folio, ... netfs_read_collection,
|
||||
netfs_unlock_abandoned_read_pages
|
||||
3. vfs_read, ... nfs_do_read_folio, nfs_read_add_folio,
|
||||
nfs_return_empty_folio
|
||||
|
||||
The problem is that nfs_read_add_folio() is not supposed to unlock the
|
||||
folio if fscache is enabled, and a nfs_netfs_folio_unlock() check is
|
||||
missing in nfs_return_empty_folio().
|
||||
|
||||
Rarely this leads to a warning in netfs_read_collection():
|
||||
|
||||
------------[ cut here ]------------
|
||||
R=0000031c: folio 10 is not locked
|
||||
WARNING: CPU: 0 PID: 29 at fs/netfs/read_collect.c:133 netfs_read_collection+0x7c0/0xf00
|
||||
[...]
|
||||
Workqueue: events_unbound netfs_read_collection_worker
|
||||
RIP: 0010:netfs_read_collection+0x7c0/0xf00
|
||||
[...]
|
||||
Call Trace:
|
||||
<TASK>
|
||||
netfs_read_collection_worker+0x67/0x80
|
||||
process_one_work+0x12e/0x2c0
|
||||
worker_thread+0x295/0x3a0
|
||||
|
||||
Most of the time, however, processes just get stuck forever in
|
||||
folio_wait_bit_common(), waiting for `PG_locked` to disappear, which
|
||||
never happens because nobody is really holding the folio lock.
|
||||
|
||||
Fixes: 000dbe0bec05 ("NFS: Convert buffered read paths to use netfs when fscache is enabled")
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
|
||||
Reviewed-by: Dave Wysochanski <dwysocha@redhat.com>
|
||||
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
|
||||
---
|
||||
fs/nfs/read.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfs/read.c
|
||||
+++ b/fs/nfs/read.c
|
||||
@@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct
|
||||
{
|
||||
folio_zero_segment(folio, 0, folio_size(folio));
|
||||
folio_mark_uptodate(folio);
|
||||
- folio_unlock(folio);
|
||||
+ if (nfs_netfs_folio_unlock(folio))
|
||||
+ folio_unlock(folio);
|
||||
return 0;
|
||||
}
|
||||
|
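
The root cause described above comes down to a simple invariant: PG_locked is a single flag bit, so each folio_lock() must be balanced by exactly one folio_unlock(), and a helper that zeroes and returns a folio has to know whether unlocking is its job or the caller's (here, netfs). A minimal sketch of that rule follows; the helper name and the boolean parameter are illustrative, only the folio_* calls are real kernel APIs.

/*
 * Zero a folio and mark it uptodate, unlocking it only if the caller
 * is not going to unlock it later.  Unlocking twice corrupts
 * PG_locked, as the commit message explains.
 */
static int return_zeroed_folio(struct folio *folio, bool caller_unlocks)
{
	folio_zero_segment(folio, 0, folio_size(folio));
	folio_mark_uptodate(folio);
	if (!caller_unlocks)
		folio_unlock(folio);
	return 0;
}

In the real fix the decision is made by nfs_netfs_folio_unlock(), which tells the caller whether it should perform the unlock itself.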
32
debian/patches/patchset-pf/nfs/0008-NFSv4-Don-t-check-for-OPEN-feature-support-in-v4.1.patch
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
From d87e5957afccde6cc0719ab0a554757dcafa85ce Mon Sep 17 00:00:00 2001
|
||||
From: Scott Mayhew <smayhew@redhat.com>
|
||||
Date: Wed, 30 Apr 2025 07:12:29 -0400
|
||||
Subject: NFSv4: Don't check for OPEN feature support in v4.1
|
||||
|
||||
fattr4_open_arguments is a v4.2 recommended attribute, so we shouldn't
|
||||
be sending it to v4.1 servers.
|
||||
|
||||
Fixes: cb78f9b7d0c0 ("nfs: fix the fetch of FATTR4_OPEN_ARGUMENTS")
|
||||
Signed-off-by: Scott Mayhew <smayhew@redhat.com>
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
|
||||
Cc: stable@vger.kernel.org # 6.11+
|
||||
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
|
||||
---
|
||||
fs/nfs/nfs4proc.c | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/fs/nfs/nfs4proc.c
|
||||
+++ b/fs/nfs/nfs4proc.c
|
||||
@@ -3976,8 +3976,9 @@ static int _nfs4_server_capabilities(str
|
||||
FATTR4_WORD0_CASE_INSENSITIVE |
|
||||
FATTR4_WORD0_CASE_PRESERVING;
|
||||
if (minorversion)
|
||||
- bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT |
|
||||
- FATTR4_WORD2_OPEN_ARGUMENTS;
|
||||
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
|
||||
+ if (minorversion > 1)
|
||||
+ bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS;
|
||||
|
||||
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
|
||||
if (status == 0) {
|
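
The hunk above effectively turns the word-2 attribute mask into a function of the minor version: SUPPATTR_EXCLCREAT is requested for v4.1 and later, OPEN_ARGUMENTS only for v4.2. A condensed restatement of that gating, assuming a hypothetical helper (the FATTR4_WORD2_* flags are the ones used in the patch):

/*
 * Illustrative helper: build the third bitmask word of a server
 * capabilities probe based on the mount's minor version.
 */
static u32 server_caps_word2(unsigned int minorversion)
{
	u32 mask = 0;

	if (minorversion >= 1)
		mask |= FATTR4_WORD2_SUPPATTR_EXCLCREAT;
	if (minorversion >= 2)	/* OPEN_ARGUMENTS is a v4.2 attribute */
		mask |= FATTR4_WORD2_OPEN_ARGUMENTS;
	return mask;
}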
96
debian/patches/patchset-pf/nfs/0009-NFS-always-probe-for-LOCALIO-support-asynchronously.patch
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
From 9e7464ef730cfe5bbab845ff12b295575d874216 Mon Sep 17 00:00:00 2001
|
||||
From: Mike Snitzer <snitzer@kernel.org>
|
||||
Date: Tue, 13 May 2025 12:08:31 -0400
|
||||
Subject: NFS: always probe for LOCALIO support asynchronously
|
||||
|
||||
It was reported that NFS client mounts of AWS Elastic File System
|
||||
(EFS) volumes are slow. This is because the AWS firewall disallows
|
||||
LOCALIO (because it doesn't consider the use of NFS_LOCALIO_PROGRAM
|
||||
valid), see: https://bugzilla.redhat.com/show_bug.cgi?id=2335129
|
||||
|
||||
Switch to performing the LOCALIO probe asynchronously to address the
|
||||
potential for the NFS LOCALIO protocol being disallowed and/or slowed
|
||||
by the remote server's response.
|
||||
|
||||
While at it, fix nfs_local_probe_async() to always take/put a
|
||||
reference on the nfs_client that is using the LOCALIO protocol.
|
||||
Also, unexport the nfs_local_probe() symbol and make it private to
|
||||
fs/nfs/localio.c.
|
||||
|
||||
This change has the side-effect of initially issuing reads, writes and
|
||||
commits over the wire via SUNRPC until the LOCALIO probe completes.
|
||||
|
||||
Suggested-by: Jeff Layton <jlayton@kernel.org> # to always probe async
|
||||
Fixes: 76d4cb6345da ("nfs: probe for LOCALIO when v4 client reconnects to server")
|
||||
Cc: stable@vger.kernel.org # 6.14+
|
||||
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
|
||||
Reviewed-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
|
||||
---
|
||||
fs/nfs/client.c | 2 +-
|
||||
fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +-
|
||||
fs/nfs/internal.h | 1 -
|
||||
fs/nfs/localio.c | 6 ++++--
|
||||
4 files changed, 6 insertions(+), 5 deletions(-)
|
||||
|
||||
--- a/fs/nfs/client.c
|
||||
+++ b/fs/nfs/client.c
|
||||
@@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const
|
||||
spin_unlock(&nn->nfs_client_lock);
|
||||
new = rpc_ops->init_client(new, cl_init);
|
||||
if (!IS_ERR(new))
|
||||
- nfs_local_probe(new);
|
||||
+ nfs_local_probe_async(new);
|
||||
return new;
|
||||
}
|
||||
|
||||
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
|
||||
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
|
||||
@@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_la
|
||||
* keep ds_clp even if DS is local, so that if local IO cannot
|
||||
* proceed somehow, we can fall back to NFS whenever we want.
|
||||
*/
|
||||
- nfs_local_probe(ds->ds_clp);
|
||||
+ nfs_local_probe_async(ds->ds_clp);
|
||||
max_payload =
|
||||
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
|
||||
NULL);
|
||||
--- a/fs/nfs/internal.h
|
||||
+++ b/fs/nfs/internal.h
|
||||
@@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct
|
||||
|
||||
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
|
||||
/* localio.c */
|
||||
-extern void nfs_local_probe(struct nfs_client *);
|
||||
extern void nfs_local_probe_async(struct nfs_client *);
|
||||
extern void nfs_local_probe_async_work(struct work_struct *);
|
||||
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
|
||||
--- a/fs/nfs/localio.c
|
||||
+++ b/fs/nfs/localio.c
|
||||
@@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(str
|
||||
* - called after alloc_client and init_client (so cl_rpcclient exists)
|
||||
* - this function is idempotent, it can be called for old or new clients
|
||||
*/
|
||||
-void nfs_local_probe(struct nfs_client *clp)
|
||||
+static void nfs_local_probe(struct nfs_client *clp)
|
||||
{
|
||||
/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
|
||||
if (!localio_enabled ||
|
||||
@@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *
|
||||
nfs_localio_enable_client(clp);
|
||||
nfs_uuid_end(&clp->cl_uuid);
|
||||
}
|
||||
-EXPORT_SYMBOL_GPL(nfs_local_probe);
|
||||
|
||||
void nfs_local_probe_async_work(struct work_struct *work)
|
||||
{
|
||||
struct nfs_client *clp =
|
||||
container_of(work, struct nfs_client, cl_local_probe_work);
|
||||
|
||||
+ if (!refcount_inc_not_zero(&clp->cl_count))
|
||||
+ return;
|
||||
nfs_local_probe(clp);
|
||||
+ nfs_put_client(clp);
|
||||
}
|
||||
|
||||
void nfs_local_probe_async(struct nfs_client *clp)
|
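
The refcount_inc_not_zero()/nfs_put_client() pair added to the worker is the usual pattern for a work item that may run after the last ordinary reference to its containing object has been dropped: pin the object on entry, bail out if it is already being torn down, and release the pin when done. A generic sketch of the same pattern follows; struct probe_ctx, do_probe() and probe_ctx_put() are placeholders, while refcount_inc_not_zero() and container_of() are the real kernel primitives.

struct probe_ctx {
	refcount_t		refcount;
	struct work_struct	work;
};

static void probe_worker(struct work_struct *work)
{
	struct probe_ctx *ctx = container_of(work, struct probe_ctx, work);

	/* the object may already be on its way to destruction */
	if (!refcount_inc_not_zero(&ctx->refcount))
		return;
	do_probe(ctx);		/* placeholder for nfs_local_probe() */
	probe_ctx_put(ctx);	/* drop the reference taken above */
}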
29
debian/patches/patchset-pf/smb/0001-smb-client-add-NULL-check-in-automount_fullpath.patch
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
From 97831e31e43bb023d208b2344546a4e51e580dc6 Mon Sep 17 00:00:00 2001
|
||||
From: Ruben Devos <devosruben6@gmail.com>
|
||||
Date: Sun, 1 Jun 2025 19:18:55 +0200
|
||||
Subject: smb: client: add NULL check in automount_fullpath
|
||||
|
||||
page is checked for null in __build_path_from_dentry_optional_prefix
|
||||
when tcon->origin_fullpath is not set. However, the check is missing when
|
||||
it is set.
|
||||
Add a check to prevent a potential NULL pointer dereference.
|
||||
|
||||
Signed-off-by: Ruben Devos <devosruben6@gmail.com>
|
||||
Cc: stable@vger.kernel.org
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/client/namespace.c | 3 +++
|
||||
1 file changed, 3 insertions(+)
|
||||
|
||||
--- a/fs/smb/client/namespace.c
|
||||
+++ b/fs/smb/client/namespace.c
|
||||
@@ -146,6 +146,9 @@ static char *automount_fullpath(struct d
|
||||
}
|
||||
spin_unlock(&tcon->tc_lock);
|
||||
|
||||
+ if (unlikely(!page))
|
||||
+ return ERR_PTR(-ENOMEM);
|
||||
+
|
||||
s = dentry_path_raw(dentry, page, PATH_MAX);
|
||||
if (IS_ERR(s))
|
||||
return s;
|
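
Stripped of the surrounding automount logic, the fix is the familiar "the allocation may have failed" guard: check the buffer before handing it to dentry_path_raw(). A minimal sketch under that assumption; the wrapper name and the way page is obtained are illustrative, while dentry_path_raw() and ERR_PTR() are the real kernel APIs.

/*
 * Build a path for dentry into a previously allocated buffer, failing
 * cleanly if that buffer could not be allocated.
 */
static char *path_into_page(struct dentry *dentry, char *page)
{
	if (unlikely(!page))	/* earlier allocation (e.g. __getname()) failed */
		return ERR_PTR(-ENOMEM);

	return dentry_path_raw(dentry, page, PATH_MAX);
}

The caller still checks IS_ERR() on the result, as in the context lines above.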
39
debian/patches/patchset-pf/smb/0002-cifs-reset-connections-for-all-channels-when-reconne.patch
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
From 0ca6d39b6d40b868eb6b4021f918de7a0f6a0f2e Mon Sep 17 00:00:00 2001
|
||||
From: Shyam Prasad N <sprasad@microsoft.com>
|
||||
Date: Mon, 2 Jun 2025 22:37:13 +0530
|
||||
Subject: cifs: reset connections for all channels when reconnect requested
|
||||
|
||||
cifs_reconnect can be called with a flag to mark the session as needing
|
||||
reconnect too. When this is done, we expect the connections of all
|
||||
channels to be reconnected too, which is not happening today.
|
||||
|
||||
Without doing this, we have seen bad things happen when primary and
|
||||
secondary channels are connected to different servers (as is the case with cloud
|
||||
services like Azure Files SMB).
|
||||
|
||||
This change would force all connections to reconnect as well, not just
|
||||
the sessions and tcons.
|
||||
|
||||
Cc: <stable@vger.kernel.org>
|
||||
Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
|
||||
Signed-off-by: Steve French <stfrench@microsoft.com>
|
||||
---
|
||||
fs/smb/client/connect.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
--- a/fs/smb/client/connect.c
|
||||
+++ b/fs/smb/client/connect.c
|
||||
@@ -377,6 +377,13 @@ static int __cifs_reconnect(struct TCP_S
|
||||
if (!cifs_tcp_ses_needs_reconnect(server, 1))
|
||||
return 0;
|
||||
|
||||
+ /*
|
||||
+ * if smb session has been marked for reconnect, also reconnect all
|
||||
+ * connections. This way, the other connections do not end up bad.
|
||||
+ */
|
||||
+ if (mark_smb_session)
|
||||
+ cifs_signal_cifsd_for_reconnect(server, mark_smb_session);
|
||||
+
|
||||
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
|
||||
|
||||
cifs_abort_connection(server);
|
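
Read together with its context, the reconnect path after this patch does three things in order: flag every channel's connection when the whole session must reconnect, mark the sessions and tcons hanging off this connection, and finally abort the current connection so it gets re-established. A condensed, illustrative view follows; the wrapper function is an assumption, the three calls are the cifs helpers visible in the hunk and its context.

static void reconnect_everything(struct TCP_Server_Info *server,
				 bool mark_smb_session)
{
	/* 1. session-wide reconnect: signal all channels, not just this one */
	if (mark_smb_session)
		cifs_signal_cifsd_for_reconnect(server, mark_smb_session);

	/* 2. mark sessions and tcons on this connection for reconnect */
	cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);

	/* 3. tear down the current connection so it is re-established */
	cifs_abort_connection(server);
}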