release 6.11.11
@@ -48,7 +48,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
 struct quirk_entry {
    u32 nominal_freq;
    u32 lowest_freq;
-@@ -1402,7 +1398,7 @@ static ssize_t amd_pstate_show_status(ch
+@@ -1380,7 +1376,7 @@ static ssize_t amd_pstate_show_status(ch
    return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
 }

@@ -57,7 +57,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
 {
    int mode_idx;

-@@ -1419,6 +1415,7 @@ static int amd_pstate_update_status(cons
+@@ -1397,6 +1393,7 @@ static int amd_pstate_update_status(cons

    return 0;
 }
@@ -14,7 +14,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -1605,7 +1605,7 @@ static void amd_pstate_epp_cpu_exit(stru
+@@ -1583,7 +1583,7 @@ static void amd_pstate_epp_cpu_exit(stru
    pr_debug("CPU %d exiting\n", policy->cpu);
 }

@@ -23,7 +23,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
 {
    struct amd_cpudata *cpudata = policy->driver_data;
    u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
-@@ -1655,7 +1655,7 @@ static void amd_pstate_epp_update_limit(
+@@ -1633,7 +1633,7 @@ static void amd_pstate_epp_update_limit(
     * This return value can only be negative for shared_memory
     * systems where EPP register read/write not supported.
     */
@@ -32,7 +32,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
 }

    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
-@@ -1668,12 +1668,13 @@ static void amd_pstate_epp_update_limit(
+@@ -1646,12 +1646,13 @@ static void amd_pstate_epp_update_limit(
    }

    WRITE_ONCE(cpudata->cppc_req_cached, value);
@@ -47,7 +47,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

    if (!policy->cpuinfo.max_freq)
        return -ENODEV;
-@@ -1683,7 +1684,9 @@ static int amd_pstate_epp_set_policy(str
+@@ -1661,7 +1662,9 @@ static int amd_pstate_epp_set_policy(str

    cpudata->policy = policy->policy;
@@ -42,7 +42,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
 +EXPORT_SYMBOL_GPL(amd_get_highest_perf);
 --- a/arch/x86/kernel/cpu/amd.c
 +++ b/arch/x86/kernel/cpu/amd.c
-@@ -1201,22 +1201,6 @@ unsigned long amd_get_dr_addr_mask(unsig
+@@ -1202,22 +1202,6 @@ unsigned long amd_get_dr_addr_mask(unsig
 }
 EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);

@@ -60,7 +60,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
  * @cpu: CPU to get numerator for.
 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -837,36 +837,6 @@ static void amd_pstste_sched_prefcore_wo
+@@ -815,36 +815,6 @@ static void amd_pstste_sched_prefcore_wo
 }
 static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);

@@ -97,7 +97,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
 #define CPPC_MAX_PERF U8_MAX

 static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
-@@ -874,7 +844,7 @@ static void amd_pstate_init_prefcore(str
+@@ -852,7 +822,7 @@ static void amd_pstate_init_prefcore(str
    int ret, prio;
    u32 highest_perf;

@@ -106,7 +106,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
    if (ret)
        return;

-@@ -918,7 +888,7 @@ static void amd_pstate_update_limits(uns
+@@ -896,7 +866,7 @@ static void amd_pstate_update_limits(uns
    if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
        goto free_cpufreq_put;
@@ -149,7 +149,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
 }
 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -841,32 +841,18 @@ static DECLARE_WORK(sched_prefcore_work,
+@@ -819,32 +819,18 @@ static DECLARE_WORK(sched_prefcore_work,

 static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
 {
@@ -185,7 +185,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

    schedule_work(&sched_prefcore_work);
 }
-@@ -1037,12 +1023,12 @@ static int amd_pstate_cpu_init(struct cp
+@@ -1015,12 +1001,12 @@ static int amd_pstate_cpu_init(struct cp

    cpudata->cpu = policy->cpu;

@@ -200,7 +200,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
    ret = amd_pstate_init_freq(cpudata);
    if (ret)
        goto free_cpudata1;
-@@ -1503,12 +1489,12 @@ static int amd_pstate_epp_cpu_init(struc
+@@ -1481,12 +1467,12 @@ static int amd_pstate_epp_cpu_init(struc
    cpudata->cpu = policy->cpu;
    cpudata->epp_policy = 0;

@@ -215,7 +215,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
    ret = amd_pstate_init_freq(cpudata);
    if (ret)
        goto free_cpudata1;
-@@ -1970,6 +1956,12 @@ static int __init amd_pstate_init(void)
+@@ -1948,6 +1934,12 @@ static int __init amd_pstate_init(void)
        static_call_update(amd_pstate_update_perf, cppc_update_perf);
    }
@@ -143,7 +143,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
    WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
    WRITE_ONCE(cpudata->lowest_nonlinear_perf,
           cppc_perf.lowest_nonlinear_perf);
-@@ -944,8 +910,8 @@ static u32 amd_pstate_get_transition_lat
+@@ -922,8 +888,8 @@ static u32 amd_pstate_get_transition_lat
 static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
 {
    int ret;
@@ -154,7 +154,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
    u32 nominal_perf, nominal_freq;
    u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
    u32 boost_ratio, lowest_nonlinear_ratio;
-@@ -967,8 +933,10 @@ static int amd_pstate_init_freq(struct a
+@@ -945,8 +911,10 @@ static int amd_pstate_init_freq(struct a

    nominal_perf = READ_ONCE(cpudata->nominal_perf);
@@ -17,7 +17,7 @@ Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>

 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -836,17 +836,17 @@ static void amd_pstate_update_limits(uns
+@@ -814,17 +814,17 @@ static void amd_pstate_update_limits(uns

    cpudata = policy->driver_data;
@@ -13,7 +13,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -1796,7 +1796,7 @@ static int __init amd_pstate_set_driver(
+@@ -1774,7 +1774,7 @@ static int __init amd_pstate_set_driver(
        return -EINVAL;
 }
@@ -93,7 +93,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
                  u32 min_perf, u32 des_perf,
                  u32 max_perf, bool fast_switch)
 {
-@@ -1919,9 +1919,9 @@ static int __init amd_pstate_init(void)
+@@ -1897,9 +1897,9 @@ static int __init amd_pstate_init(void)
        current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
    } else {
        pr_debug("AMD CPPC shared memory based functionality is supported\n");
@@ -33,7 +33,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>

 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -1025,13 +1025,6 @@ static int amd_pstate_cpu_init(struct cp
+@@ -1003,13 +1003,6 @@ static int amd_pstate_cpu_init(struct cp
    if (cpu_feature_enabled(X86_FEATURE_CPPC))
        policy->fast_switch_possible = true;

@@ -47,7 +47,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
    ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
                   FREQ_QOS_MAX, policy->cpuinfo.max_freq);
    if (ret < 0) {
-@@ -1746,6 +1739,13 @@ static int amd_pstate_epp_resume(struct
+@@ -1724,6 +1717,13 @@ static int amd_pstate_epp_resume(struct
    return 0;
 }

@@ -61,7 +61,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
 static struct cpufreq_driver amd_pstate_driver = {
    .flags      = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
    .verify     = amd_pstate_verify,
-@@ -1759,6 +1759,7 @@ static struct cpufreq_driver amd_pstate_
+@@ -1737,6 +1737,7 @@ static struct cpufreq_driver amd_pstate_
    .update_limits  = amd_pstate_update_limits,
    .name       = "amd-pstate",
    .attr       = amd_pstate_attr,
@@ -69,7 +69,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
 };

 static struct cpufreq_driver amd_pstate_epp_driver = {
-@@ -1775,6 +1776,7 @@ static struct cpufreq_driver amd_pstate_
+@@ -1753,6 +1754,7 @@ static struct cpufreq_driver amd_pstate_
    .set_boost  = amd_pstate_set_boost,
    .name       = "amd-pstate-epp",
    .attr       = amd_pstate_epp_attr,
@@ -16,7 +16,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>

 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
-@@ -726,7 +726,7 @@ static int amd_pstate_cpu_boost_update(s
+@@ -704,7 +704,7 @@ static int amd_pstate_cpu_boost_update(s
        policy->max = policy->cpuinfo.max_freq;

    if (cppc_state == AMD_PSTATE_PASSIVE) {
@@ -25,7 +25,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
        if (ret < 0)
            pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
    }
-@@ -993,17 +993,17 @@ static int amd_pstate_cpu_init(struct cp
+@@ -971,17 +971,17 @@ static int amd_pstate_cpu_init(struct cp

    ret = amd_pstate_init_perf(cpudata);
    if (ret)
@@ -46,7 +46,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>

    min_freq = READ_ONCE(cpudata->min_freq);
    max_freq = READ_ONCE(cpudata->max_freq);
-@@ -1025,11 +1025,11 @@ static int amd_pstate_cpu_init(struct cp
+@@ -1003,11 +1003,11 @@ static int amd_pstate_cpu_init(struct cp
    if (cpu_feature_enabled(X86_FEATURE_CPPC))
        policy->fast_switch_possible = true;

@@ -60,7 +60,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
    }

    cpudata->max_limit_freq = max_freq;
-@@ -1042,9 +1042,7 @@ static int amd_pstate_cpu_init(struct cp
+@@ -1020,9 +1020,7 @@ static int amd_pstate_cpu_init(struct cp

    return 0;

@@ -71,7 +71,7 @@ Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
    kfree(cpudata);
    return ret;
 }
-@@ -1053,8 +1051,7 @@ static void amd_pstate_cpu_exit(struct c
+@@ -1031,8 +1029,7 @@ static void amd_pstate_cpu_exit(struct c
 {
    struct amd_cpudata *cpudata = policy->driver_data;

@@ -38,7 +38,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>

    lowest_perf = READ_ONCE(cpudata->lowest_perf);
    if (min_limit_perf < lowest_perf)
-@@ -1526,10 +1531,13 @@ static int amd_pstate_epp_update_limit(s
+@@ -1504,10 +1509,13 @@ static int amd_pstate_epp_update_limit(s
    u64 value;
    s16 epp;
@@ -1,55 +0,0 @@
From 01ad0fb3da95867947d923596a26b18d844afe3c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:17 -0500
Subject: cpufreq/amd-pstate: Don't update CPPC request in
 amd_pstate_cpu_boost_update()

When boost is changed, the CPPC value is changed in amd_pstate_cpu_boost_update()
but then changed again when refresh_frequency_limits() and all its callbacks
occur. The first is a pointless write, so instead just update the limits for
the policy and let the policy refresh anchor everything properly.
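For context, the write being removed only rewrites the max_perf byte (bits 7:0) of the cached CPPC request before issuing the MSR write. A minimal stand-alone sketch of that bit manipulation, with the field position inferred from the GENMASK_ULL(7, 0) usage in the removed code below (the function name and values here are hypothetical, not part of the driver):

```c
#include <stdint.h>
#include <stdio.h>

/* Bits 7:0 of the CPPC request hold max_perf; the removed code switched
 * this field between highest_perf (boost on) and nominal_perf (boost off). */
static uint64_t cppc_req_set_max_perf(uint64_t cached_req, uint8_t perf)
{
    cached_req &= ~0xffULL; /* clear the max_perf field, bits 7:0 */
    cached_req |= perf;     /* install the new max_perf value */
    return cached_req;
}

int main(void)
{
    uint64_t req = 0xdeadbeef00ULL; /* pretend cached CPPC_REQ value */

    printf("boost on:  %#llx\n",
           (unsigned long long)cppc_req_set_max_perf(req, 0xff /* highest_perf */));
    printf("boost off: %#llx\n",
           (unsigned long long)cppc_req_set_max_perf(req, 0xa6 /* nominal_perf */));
    return 0;
}
```

With the patch, no request is written here at all: only the policy limits are updated, and the subsequent policy refresh issues the single authoritative write.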

Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -695,34 +695,12 @@ static void amd_pstate_adjust_perf(unsig
 static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
 {
    struct amd_cpudata *cpudata = policy->driver_data;
-   struct cppc_perf_ctrls perf_ctrls;
-   u32 highest_perf, nominal_perf, nominal_freq, max_freq;
+   u32 nominal_freq, max_freq;
    int ret = 0;

-   highest_perf = READ_ONCE(cpudata->highest_perf);
-   nominal_perf = READ_ONCE(cpudata->nominal_perf);
    nominal_freq = READ_ONCE(cpudata->nominal_freq);
    max_freq = READ_ONCE(cpudata->max_freq);

-   if (boot_cpu_has(X86_FEATURE_CPPC)) {
-       u64 value = READ_ONCE(cpudata->cppc_req_cached);
-
-       value &= ~GENMASK_ULL(7, 0);
-       value |= on ? highest_perf : nominal_perf;
-       WRITE_ONCE(cpudata->cppc_req_cached, value);
-
-       wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
-   } else {
-       perf_ctrls.max_perf = on ? highest_perf : nominal_perf;
-       ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
-       if (ret) {
-           cpufreq_cpu_release(policy);
-           pr_debug("Failed to set max perf on CPU:%d. ret:%d\n",
-                cpudata->cpu, ret);
-           return ret;
-       }
-   }
-
    if (on)
        policy->cpuinfo.max_freq = max_freq;
    else if (policy->cpuinfo.max_freq > nominal_freq * 1000)
@@ -1,33 +0,0 @@
From e82b9b5a56bcac18cae68878fe67263279805735 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Date: Mon, 21 Oct 2024 15:48:35 +0530
Subject: amd-pstate: Set min_perf to nominal_perf for active mode performance
 gov

The amd-pstate driver sets CPPC_REQ.min_perf to CPPC_REQ.max_perf when
in active mode with the performance governor. Typically CPPC_REQ.max_perf
is set to CPPC.highest_perf. This causes frequency throttling on
power-limited platforms, which causes performance regressions on
certain classes of workloads.

Hence, set CPPC_REQ.min_perf to CPPC.nominal_perf or
CPPC_REQ.max_perf, whichever of the two is lower.
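A worked example of the new clamping; the perf values are assumed for illustration only (CPPC perf scales are CPU-specific):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    unsigned int highest_perf = 255;      /* assumed example value */
    unsigned int nominal_perf = 166;      /* assumed example value */
    unsigned int max_perf = highest_perf; /* performance governor request ceiling */

    /* Old: min_perf = max_perf = 255, pinning the floor at highest_perf.
     * New: the floor drops to nominal_perf, giving power-limited parts
     * room to resolve the request without throttling. */
    unsigned int min_perf = MIN(nominal_perf, max_perf);

    printf("min_perf = %u\n", min_perf); /* prints 166 */
    return 0;
}
```

On the old code path min_perf would stay at 255, forcing the floor to highest_perf even when the platform cannot sustain it.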

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1565,7 +1565,7 @@ static int amd_pstate_epp_update_limit(s
    value = READ_ONCE(cpudata->cppc_req_cached);

    if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
-       min_perf = max_perf;
+       min_perf = min(cpudata->nominal_perf, max_perf);

    /* Initial min/max values for CPPC Performance Controls Register */
    value &= ~AMD_CPPC_MIN_PERF(~0L);

@@ -95,7 +95,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com>
    policy->governor_data = sg_policy;
    sg_policy->tunables = tunables;

-@@ -834,6 +870,8 @@ static int sugov_start(struct cpufreq_po
+@@ -833,6 +869,8 @@ static int sugov_start(struct cpufreq_po
    sg_policy->limits_changed = false;
    sg_policy->cached_raw_freq = 0;

@@ -314,7 +314,7 @@ Signed-off-by: Christian Loehle <christian.loehle@arm.com>
    policy->governor_data = sg_policy;
    sg_policy->tunables = tunables;

-@@ -870,8 +697,6 @@ static int sugov_start(struct cpufreq_po
+@@ -869,8 +696,6 @@ static int sugov_start(struct cpufreq_po
    sg_policy->limits_changed = false;
    sg_policy->cached_raw_freq = 0;
@@ -1,326 +0,0 @@
From 11fa4cfe7134f44f2cdac4b25636fc3291096979 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 8 Nov 2024 08:07:37 -0500
Subject: KVM: x86: switch hugepage recovery thread to vhost_task

kvm_vm_create_worker_thread() is meant to be used for kthreads that
can consume significant amounts of CPU time on behalf of a VM or in
response to how the VM behaves (for example how it accesses its memory).
Therefore it wants to charge the CPU time consumed by that work to
the VM's container.

However, because of these threads, cgroups which have kvm instances inside
never complete freezing. This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $fish_pid > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 --nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path, but the
kvm_vm_worker_thread() threads never call into the signal delivery path while
joining non-root cgroups, so they never get frozen. Because the cgroup
freezer determines whether a given cgroup is frozen by comparing the number
of frozen threads to the total number of threads in the cgroup, the cgroup
never becomes frozen and users waiting for the state transition may hang
indefinitely.
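A toy sketch of that counting rule as described above (an illustration of the freezer's reported state, not the kernel's actual freezer code):

```c
#include <stdbool.h>

/* A cgroup only reports "frozen 1" once every one of its tasks has
 * entered the frozen state; a single unfreezable worker thread keeps
 * the whole cgroup stuck at "frozen 0". */
static bool cgroup_reports_frozen(unsigned int frozen_tasks,
                                  unsigned int total_tasks)
{
    return total_tasks > 0 && frozen_tasks == total_tasks;
}
```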

Since the worker kthread is tied to a user process, it's better if
it behaves similarly to user tasks as much as possible, including
responding to SIGSTOP and SIGCONT. In fact, vhost_task is all
that kvm_vm_create_worker_thread() wanted to be and more: not only does it
inherit the userspace process's cgroups, it has other niceties like
being parented properly in the process tree. Use it instead of the
homegrown alternative.
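For reference, the vhost_task API the patch adopts has a small surface: create with a work callback and a SIGKILL callback, then start, wake, and stop. A minimal usage sketch, with signatures taken from the calls in the diff below; it builds only in-tree, and the callback contract (return true to be invoked again, false to sleep until vhost_task_wake()) is inferred from this patch's worker, so treat it as an assumption:

```c
#include <linux/sched/vhost_task.h>

/* Toy worker: performs one unit of work per invocation. */
static bool demo_worker(void *data)
{
    /* ... do some work on `data` ... */
    return false; /* nothing left: sleep until vhost_task_wake() */
}

/* Invoked if the owning process receives SIGKILL; release resources here. */
static void demo_worker_kill(void *data)
{
}

static struct vhost_task *demo_start(void *data)
{
    struct vhost_task *vtsk;

    /* The task is created as a child of the current process, so it
     * inherits the process's cgroups and is parented under it. */
    vtsk = vhost_task_create(demo_worker, demo_worker_kill,
                             data, "demo-worker");
    if (!vtsk)
        return NULL; /* allocation failure, per the -ENOMEM path below */

    vhost_task_start(vtsk);
    return vtsk;
}
```

Teardown mirrors the replacements in the diff: vhost_task_stop() takes the place of kthread_stop(), and vhost_task_wake() takes the place of wake_up_process().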

(Commit message based on emails from Tejun.)

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Luca Boccassi <bluca@debian.org>
Tested-by: Luca Boccassi <bluca@debian.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   4 +-
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/mmu/mmu.c          |  67 +++++++++++----------
 include/linux/kvm_host.h        |   6 --
 virt/kvm/kvm_main.c             | 103 --------------------------------
 5 files changed, 39 insertions(+), 142 deletions(-)

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>

 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1445,7 +1446,8 @@ struct kvm_arch {
    bool sgx_provisioning_allowed;

    struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-   struct task_struct *nx_huge_page_recovery_thread;
+   struct vhost_task *nx_huge_page_recovery_thread;
+   u64 nx_huge_page_next;

 #ifdef CONFIG_X86_64
    /* The number of TDP MMU pages across all roots. */
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
    select HAVE_KVM_IRQ_BYPASS
    select HAVE_KVM_IRQ_ROUTING
    select HAVE_KVM_READONLY_MEM
+   select VHOST_TASK
    select KVM_ASYNC_PF
    select USER_RETURN_NOTIFIER
    select KVM_MMIO
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7160,7 +7160,7 @@ static int set_nx_huge_pages(const char
            kvm_mmu_zap_all_fast(kvm);
            mutex_unlock(&kvm->slots_lock);

-           wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+           vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
        }
        mutex_unlock(&kvm_lock);
    }
@@ -7306,7 +7306,7 @@ static int set_nx_huge_pages_recovery_pa
    mutex_lock(&kvm_lock);

    list_for_each_entry(kvm, &vm_list, vm_list)
-       wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+       vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);

    mutex_unlock(&kvm_lock);
 }
@@ -7409,62 +7409,65 @@ static void kvm_recover_nx_huge_pages(st
    srcu_read_unlock(&kvm->srcu, rcu_idx);
 }

-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+#define NX_HUGE_PAGE_DISABLED (-1)
+
+static u64 get_nx_huge_page_recovery_next(void)
 {
    bool enabled;
    uint period;

    enabled = calc_nx_huge_pages_recovery_period(&period);

-   return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-       : MAX_SCHEDULE_TIMEOUT;
+   return enabled ? get_jiffies_64() + msecs_to_jiffies(period)
+       : NX_HUGE_PAGE_DISABLED;
 }

-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-   u64 start_time;
-   long remaining_time;
-
-   while (true) {
-       start_time = get_jiffies_64();
-       remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop() && remaining_time > 0) {
-           schedule_timeout(remaining_time);
-           remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-           set_current_state(TASK_INTERRUPTIBLE);
-       }
+}

-       set_current_state(TASK_RUNNING);
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+   struct kvm *kvm = data;
+   long remaining_time;

-       if (kthread_should_stop())
-           return 0;
+   if (kvm->arch.nx_huge_page_next == NX_HUGE_PAGE_DISABLED)
+       return false;

-       kvm_recover_nx_huge_pages(kvm);
+   remaining_time = kvm->arch.nx_huge_page_next - get_jiffies_64();
+   if (remaining_time > 0) {
+       schedule_timeout(remaining_time);
+       /* check for signals and come back */
+       return true;
    }
+
+   __set_current_state(TASK_RUNNING);
+   kvm_recover_nx_huge_pages(kvm);
+   kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+   return true;
 }

 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-   int err;
-
    if (nx_hugepage_mitigation_hard_disabled)
        return 0;

-   err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-                     "kvm-nx-lpage-recovery",
-                     &kvm->arch.nx_huge_page_recovery_thread);
-   if (!err)
-       kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+   kvm->arch.nx_huge_page_next = get_nx_huge_page_recovery_next();
+   kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+       kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+       kvm, "kvm-nx-lpage-recovery");

-   return err;
+   if (!kvm->arch.nx_huge_page_recovery_thread)
+       return -ENOMEM;
+
+   vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+   return 0;
 }

 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
    if (kvm->arch.nx_huge_page_recovery_thread)
-       kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+       vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }

 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2370,12 +2370,6 @@ static inline int kvm_arch_vcpu_run_pid_
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */

-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-               uintptr_t data, const char *name,
-               struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6573,106 +6573,3 @@ void kvm_exit(void)
    kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-   struct kvm *kvm;
-   struct task_struct *parent;
-   struct completion init_done;
-   kvm_vm_thread_fn_t thread_fn;
-   uintptr_t data;
-   int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-   /*
-    * The init_context is allocated on the stack of the parent thread, so
-    * we have to locally copy anything that is needed beyond initialization
-    */
-   struct kvm_vm_worker_thread_context *init_context = context;
-   struct task_struct *parent;
-   struct kvm *kvm = init_context->kvm;
-   kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-   uintptr_t data = init_context->data;
-   int err;
-
-   err = kthread_park(current);
-   /* kthread_park(current) is never supposed to return an error */
-   WARN_ON(err != 0);
-   if (err)
-       goto init_complete;
-
-   err = cgroup_attach_task_all(init_context->parent, current);
-   if (err) {
-       kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-           __func__, err);
-       goto init_complete;
-   }
-
-   set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-   init_context->err = err;
-   complete(&init_context->init_done);
-   init_context = NULL;
-
-   if (err)
-       goto out;
-
-   /* Wait to be woken up by the spawner before proceeding. */
-   kthread_parkme();
-
-   if (!kthread_should_stop())
-       err = thread_fn(kvm, data);
-
-out:
-   /*
-    * Move kthread back to its original cgroup to prevent it lingering in
-    * the cgroup of the VM process, after the latter finishes its
-    * execution.
-    *
-    * kthread_stop() waits on the 'exited' completion condition which is
-    * set in exit_mm(), via mm_release(), in do_exit(). However, the
-    * kthread is removed from the cgroup in the cgroup_exit() which is
-    * called after the exit_mm(). This causes the kthread_stop() to return
-    * before the kthread actually quits the cgroup.
-    */
-   rcu_read_lock();
-   parent = rcu_dereference(current->real_parent);
-   get_task_struct(parent);
-   rcu_read_unlock();
-   cgroup_attach_task_all(parent, current);
-   put_task_struct(parent);
-
-   return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-               uintptr_t data, const char *name,
-               struct task_struct **thread_ptr)
-{
-   struct kvm_vm_worker_thread_context init_context = {};
-   struct task_struct *thread;
-
-   *thread_ptr = NULL;
-   init_context.kvm = kvm;
-   init_context.parent = current;
-   init_context.thread_fn = thread_fn;
-   init_context.data = data;
-   init_completion(&init_context.init_done);
-
-   thread = kthread_run(kvm_vm_worker_thread, &init_context,
-                "%s-%d", name, task_pid_nr(current));
-   if (IS_ERR(thread))
-       return PTR_ERR(thread);
-
-   /* kthread_run is never supposed to return NULL */
-   WARN_ON(thread == NULL);
-
-   wait_for_completion(&init_context.init_done);
-
-   if (!init_context.err)
-       *thread_ptr = thread;
-
-   return init_context.err;
-}