
release 6.12.4 (preliminary)

2024-12-10 06:44:25 +03:00
parent 9debc8729c
commit 407e7bac82
246 changed files with 4681 additions and 5758 deletions


@@ -1,4 +1,4 @@
From 3427331872c37b2edb42406c65764e1565b0591b Mon Sep 17 00:00:00 2001
From f077f4265c59f5d417aa40eaf82bb632c891b221 Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Fri, 9 Aug 2024 14:09:05 +0800
Subject: cpufreq: amd-pstate: add quirk for Ryzen 3000 series processor
@@ -17,7 +17,7 @@ Signed-off-by: Perry Yuan <perry.yuan@amd.com>
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -142,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62
@@ -136,6 +136,11 @@ static struct quirk_entry quirk_amd_7k62
.lowest_freq = 550,
};
@@ -29,7 +29,7 @@ Signed-off-by: Perry Yuan <perry.yuan@amd.com>
static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
{
/**
@@ -158,6 +163,21 @@ static int __init dmi_matched_7k62_bios_
@@ -152,6 +157,21 @@ static int __init dmi_matched_7k62_bios_
return 0;
}
@@ -51,7 +51,7 @@ Signed-off-by: Perry Yuan <perry.yuan@amd.com>
static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
{
.callback = dmi_matched_7k62_bios_bug,
@@ -168,6 +188,16 @@ static const struct dmi_system_id amd_ps
@@ -162,6 +182,16 @@ static const struct dmi_system_id amd_ps
},
.driver_data = &quirk_amd_7k62,
},
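
For readers following the quirk plumbing that this patch extends, the shape of an amd-pstate DMI quirk is sketched below. The frequencies, match strings and identifiers are hypothetical placeholders for illustration only, not the values added for the Ryzen 3000 series.

```
/* Illustrative sketch only: all values and match strings below are
 * hypothetical placeholders, not what the Ryzen 3000 quirk actually uses. */
static struct quirk_entry quirk_example = {
	.nominal_freq = 3600,	/* placeholder MHz */
	.lowest_freq  = 550,	/* placeholder MHz */
};

static int __init dmi_matched_example_bios_bug(const struct dmi_system_id *dmi)
{
	/* Assumed behaviour: record dmi->driver_data (a quirk_entry) so the
	 * driver can override the frequencies reported by a buggy BIOS. */
	return 1;
}

static const struct dmi_system_id amd_pstate_quirks_example[] __initconst = {
	{
		.callback = dmi_matched_example_bios_bug,
		.matches = {
			DMI_MATCH(DMI_BIOS_VERSION, "X.Y.Z"),	/* placeholder */
		},
		.driver_data = &quirk_example,
	},
	{}
};
```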


@@ -1,88 +0,0 @@
From 44f21855901b1fd618ac16b07dbd14e8fea4ee13 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:11 -0500
Subject: cpufreq/amd-pstate: Export symbols for changing modes
In order to effectively test all mode switch combinations, export
everything necessary for amd-pstate-ut to trigger a mode switch.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 23 ++++++++++-------------
drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++
2 files changed, 24 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -60,18 +60,6 @@
#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF
#define AMD_CPPC_EPP_POWERSAVE 0xFF
-/*
- * enum amd_pstate_mode - driver working mode of amd pstate
- */
-enum amd_pstate_mode {
- AMD_PSTATE_UNDEFINED = 0,
- AMD_PSTATE_DISABLE,
- AMD_PSTATE_PASSIVE,
- AMD_PSTATE_ACTIVE,
- AMD_PSTATE_GUIDED,
- AMD_PSTATE_MAX,
-};
-
static const char * const amd_pstate_mode_string[] = {
[AMD_PSTATE_UNDEFINED] = "undefined",
[AMD_PSTATE_DISABLE] = "disable",
@@ -81,6 +69,14 @@ static const char * const amd_pstate_mod
NULL,
};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode)
+{
+ if (mode < 0 || mode >= AMD_PSTATE_MAX)
+ return NULL;
+ return amd_pstate_mode_string[mode];
+}
+EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string);
+
struct quirk_entry {
u32 nominal_freq;
u32 lowest_freq;
@@ -1380,7 +1376,7 @@ static ssize_t amd_pstate_show_status(ch
return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
}
-static int amd_pstate_update_status(const char *buf, size_t size)
+int amd_pstate_update_status(const char *buf, size_t size)
{
int mode_idx;
@@ -1397,6 +1393,7 @@ static int amd_pstate_update_status(cons
return 0;
}
+EXPORT_SYMBOL_GPL(amd_pstate_update_status);
static ssize_t status_show(struct device *dev,
struct device_attribute *attr, char *buf)
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -103,4 +103,18 @@ struct amd_cpudata {
bool boost_state;
};
+/*
+ * enum amd_pstate_mode - driver working mode of amd pstate
+ */
+enum amd_pstate_mode {
+ AMD_PSTATE_UNDEFINED = 0,
+ AMD_PSTATE_DISABLE,
+ AMD_PSTATE_PASSIVE,
+ AMD_PSTATE_ACTIVE,
+ AMD_PSTATE_GUIDED,
+ AMD_PSTATE_MAX,
+};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode);
+int amd_pstate_update_status(const char *buf, size_t size);
+
#endif /* _LINUX_AMD_PSTATE_H */
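
With amd_pstate_update_status() and amd_pstate_get_mode_string() exported, a consumer such as amd-pstate-ut can request a mode switch with a couple of calls. A minimal sketch (error handling trimmed, mode chosen arbitrarily):

```
/* Sketch: drive a driver mode switch through the newly exported helpers. */
#include <linux/string.h>
#include "amd-pstate.h"

static int example_switch_to_passive(void)
{
	const char *mode = amd_pstate_get_mode_string(AMD_PSTATE_PASSIVE);

	if (!mode)
		return -EINVAL;

	/* Same helper that backs the sysfs "status" interface: parses the
	 * string and re-registers the driver in the requested mode. */
	return amd_pstate_update_status(mode, strlen(mode));
}
```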


@@ -1,4 +1,4 @@
From 2e2ba39aec71fb51e897c3275b255ef806800cf0 Mon Sep 17 00:00:00 2001
From da2702cb43c82219d4624135d85e2fc8768aeb09 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:23:51 -0500
Subject: cpufreq/amd-pstate: Fix non kerneldoc comment
@@ -13,7 +13,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1774,7 +1774,7 @@ static int __init amd_pstate_set_driver(
@@ -1782,7 +1782,7 @@ static int __init amd_pstate_set_driver(
return -EINVAL;
}


@@ -1,4 +1,4 @@
From d74ce254cc470da670d6b90c69bab553cdbde62b Mon Sep 17 00:00:00 2001
From a7b86a6057ccc8f5b5ab4d08e753b2a034fa7d28 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Tue, 17 Sep 2024 09:14:35 +0000
Subject: cpufreq/amd-pstate: Rename MSR and shared memory specific functions
@@ -93,7 +93,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
u32 min_perf, u32 des_perf,
u32 max_perf, bool fast_switch)
{
@@ -1897,9 +1897,9 @@ static int __init amd_pstate_init(void)
@@ -1905,9 +1905,9 @@ static int __init amd_pstate_init(void)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");


@@ -1,77 +0,0 @@
From aabfc7370a7da9c52be97c79ba70a20201e6864a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:12 -0500
Subject: cpufreq/amd-pstate-ut: Add test case for mode switches
There is a state machine in the amd-pstate driver used for switching
between all modes. To make sure that cleanup and setup work properly
for each mode, add a unit test case that tries all combinations.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++-
1 file changed, 40 insertions(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid
static void amd_pstate_ut_check_enabled(u32 index);
static void amd_pstate_ut_check_perf(u32 index);
static void amd_pstate_ut_check_freq(u32 index);
+static void amd_pstate_ut_check_driver(u32 index);
static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
{"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid },
{"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled },
{"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf },
- {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }
+ {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq },
+ {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }
};
static bool get_shared_mem(void)
@@ -257,6 +259,43 @@ skip_test:
cpufreq_cpu_put(policy);
}
+static int amd_pstate_set_mode(enum amd_pstate_mode mode)
+{
+ const char *mode_str = amd_pstate_get_mode_string(mode);
+
+ pr_debug("->setting mode to %s\n", mode_str);
+
+ return amd_pstate_update_status(mode_str, strlen(mode_str));
+}
+
+static void amd_pstate_ut_check_driver(u32 index)
+{
+ enum amd_pstate_mode mode1, mode2;
+ int ret;
+
+ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
+ ret = amd_pstate_set_mode(mode1);
+ if (ret)
+ goto out;
+ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
+ if (mode1 == mode2)
+ continue;
+ ret = amd_pstate_set_mode(mode2);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
+ amd_pstate_get_mode_string(mode1),
+ amd_pstate_get_mode_string(mode2), ret);
+
+ amd_pstate_ut_cases[index].result = ret ?
+ AMD_PSTATE_UT_RESULT_FAIL :
+ AMD_PSTATE_UT_RESULT_PASS;
+}
+
static int __init amd_pstate_ut_init(void)
{
u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);


@@ -1,60 +0,0 @@
From 24e62fbc101d079d398ac6fc76f458676d3d9491 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sun, 1 Sep 2024 00:00:35 -0500
Subject: cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit()
amd_pstate_set_epp() calls cppc_set_epp_perf(), which can fail for a
variety of reasons, but the result is ignored. Change the return flow
so that failures are propagated to the caller.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1583,7 +1583,7 @@ static void amd_pstate_epp_cpu_exit(stru
pr_debug("CPU %d exiting\n", policy->cpu);
}
-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
+static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
@@ -1633,7 +1633,7 @@ static void amd_pstate_epp_update_limit(
* This return value can only be negative for shared_memory
* systems where EPP register read/write not supported.
*/
- return;
+ return epp;
}
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
@@ -1646,12 +1646,13 @@ static void amd_pstate_epp_update_limit(
}
WRITE_ONCE(cpudata->cppc_req_cached, value);
- amd_pstate_set_epp(cpudata, epp);
+ return amd_pstate_set_epp(cpudata, epp);
}
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
+ int ret;
if (!policy->cpuinfo.max_freq)
return -ENODEV;
@@ -1661,7 +1662,9 @@ static int amd_pstate_epp_set_policy(str
cpudata->policy = policy->policy;
- amd_pstate_epp_update_limit(policy);
+ ret = amd_pstate_epp_update_limit(policy);
+ if (ret)
+ return ret;
/*
* policy->cur is never updated with the amd_pstate_epp driver, but it


@@ -1,4 +1,4 @@
From 684d162c08ab86fff02861c907ecc92bf9c09af4 Mon Sep 17 00:00:00 2001
From 4e784acad1b275ddcb105b053922ab2179ab3f86 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:18 -0500
Subject: cpufreq/amd-pstate: Use amd_pstate_update_min_max_limit() for EPP
@@ -17,7 +17,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1505,26 +1505,13 @@ static void amd_pstate_epp_cpu_exit(stru
@@ -1515,26 +1515,13 @@ static void amd_pstate_epp_cpu_exit(stru
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;


@@ -1,4 +1,4 @@
From fa46d2873c9fa4060ce407e4bc5c7e29babce9d0 Mon Sep 17 00:00:00 2001
From ed39a400d49baf070571bb1da475297a8495aa67 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:19 -0500
Subject: cpufreq/amd-pstate: Drop needless EPP initialization
@@ -14,7 +14,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1548,12 +1548,6 @@ static int amd_pstate_epp_update_limit(s
@@ -1558,12 +1558,6 @@ static int amd_pstate_epp_update_limit(s
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
epp = 0;


@@ -1,67 +0,0 @@
From 29c0347dd542e091e2f7e5980dd885f918f5f676 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:57 -0500
Subject: x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c
To prepare to let amd_get_highest_perf() detect preferred cores
it will require CPPC functions. Move amd_get_highest_perf() to
cppc.c to prepare for 'preferred core detection' rework.
No functional changes intended.
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++
arch/x86/kernel/cpu/amd.c | 16 ----------------
2 files changed, 16 insertions(+), 16 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+u32 amd_get_highest_perf(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+ return 166;
+
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+ return 166;
+
+ return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1202,22 +1202,6 @@ unsigned long amd_get_dr_addr_mask(unsig
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());


@@ -1,95 +0,0 @@
From 072efeb45349edd8ba9def11b6a450eaf56690a8 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:58 -0500
Subject: ACPI: CPPC: Adjust return code for inline functions in
!CONFIG_ACPI_CPPC_LIB
Checkpatch emits the following warning:
```
WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP
```
Adjust the code accordingly.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
include/acpi/cppc_acpi.h | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -164,31 +164,31 @@ extern int cppc_set_auto_sel(int cpu, bo
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_enable(int cpu, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline bool cppc_perf_ctrs_in_pcc(void)
{
@@ -212,27 +212,27 @@ static inline bool cpc_ffh_supported(voi
}
static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_auto_sel(int cpu, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
#endif /* !CONFIG_ACPI_CPPC_LIB */


@@ -0,0 +1,54 @@
From 2a012fdcef91db6e380000b8042f90544ea49fd7 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 05:39:28 +0000
Subject: cpufreq/amd-pstate: Remove the redundant verify() function
Merge the two verify() callback functions and rename the
cpufreq_policy_data argument for better readability.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 15 ++++-----------
1 file changed, 4 insertions(+), 11 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -557,10 +557,10 @@ cpufreq_policy_put:
cpufreq_cpu_put(policy);
}
-static int amd_pstate_verify(struct cpufreq_policy_data *policy)
+static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
{
- cpufreq_verify_within_cpu_limits(policy);
-
+ cpufreq_verify_within_cpu_limits(policy_data);
+ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
return 0;
}
@@ -1668,13 +1668,6 @@ static int amd_pstate_epp_cpu_offline(st
return 0;
}
-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
-{
- cpufreq_verify_within_cpu_limits(policy);
- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
- return 0;
-}
-
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
@@ -1730,7 +1723,7 @@ static struct cpufreq_driver amd_pstate_
static struct cpufreq_driver amd_pstate_epp_driver = {
.flags = CPUFREQ_CONST_LOOPS,
- .verify = amd_pstate_epp_verify_policy,
+ .verify = amd_pstate_verify,
.setpolicy = amd_pstate_epp_set_policy,
.init = amd_pstate_epp_cpu_init,
.exit = amd_pstate_epp_cpu_exit,
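
Both drivers now share a single verify() callback, whose only real job is to clamp the requested limits; cpufreq_verify_within_cpu_limits() does that against the CPU's reported range. A rough, simplified equivalent of that clamping, for illustration only (the real helper also keeps max >= min):

```
/* Rough, simplified equivalent of cpufreq_verify_within_cpu_limits(). */
#include <linux/cpufreq.h>
#include <linux/minmax.h>

static void example_clamp_limits(struct cpufreq_policy_data *policy_data)
{
	policy_data->min = clamp(policy_data->min,
				 policy_data->cpuinfo.min_freq,
				 policy_data->cpuinfo.max_freq);
	policy_data->max = clamp(policy_data->max,
				 policy_data->cpuinfo.min_freq,
				 policy_data->cpuinfo.max_freq);
}
```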


@@ -1,6 +1,6 @@
From f5b234be445a45b0bcacc37e0aad7a6bc7900eac Mon Sep 17 00:00:00 2001
From ba9f0c8ca71a1c44e35dd8d3221abda0bd4e465a Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:54 +0000
Date: Thu, 17 Oct 2024 05:39:30 +0000
Subject: cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq
According to the AMD architectural programmer's manual volume 2 [1], in
@@ -20,60 +20,55 @@ are completed sooner). This has shown a power benefit in some systems,
in other systems, power consumption has increased but so has the
throughput/watt.
Use the get_init_min_freq() callback to set the initial lower limit for
amd-pstate driver to lowest_nonlinear_freq instead of lowest_freq.
Modify the initial policy_data->min set by the cpufreq core to
lowest_nonlinear_freq in the ->verify() callback. Also set
cpudata->req[0] to FREQ_QOS_MIN_DEFAULT_VALUE (i.e. 0), so that it also
gets overridden by the check in the verify function.
Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf [1]
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
drivers/cpufreq/amd-pstate.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1003,13 +1003,6 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
@@ -559,8 +559,27 @@ cpufreq_policy_put:
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
- FREQ_QOS_MIN, policy->cpuinfo.min_freq);
- if (ret < 0) {
- dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
- goto free_cpudata1;
- }
-
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
@@ -1724,6 +1717,13 @@ static int amd_pstate_epp_resume(struct
static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
{
+ /*
+ * Initialize lower frequency limit (i.e.policy->min) with
+ * lowest_nonlinear_frequency which is the most energy efficient
+ * frequency. Override the initial value set by cpufreq core and
+ * amd-pstate qos_requests.
+ */
+ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
+ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
+ struct amd_cpudata *cpudata;
+
+ if (!policy)
+ return -EINVAL;
+
+ cpudata = policy->driver_data;
+ policy_data->min = cpudata->lowest_nonlinear_freq;
+ cpufreq_cpu_put(policy);
+ }
+
cpufreq_verify_within_cpu_limits(policy_data);
pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
+
return 0;
}
+static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
+{
+ struct amd_cpudata *cpudata = policy->driver_data;
+
+ return READ_ONCE(cpudata->lowest_nonlinear_freq);
+}
+
static struct cpufreq_driver amd_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
.verify = amd_pstate_verify,
@@ -1737,6 +1737,7 @@ static struct cpufreq_driver amd_pstate_
.update_limits = amd_pstate_update_limits,
.name = "amd-pstate",
.attr = amd_pstate_attr,
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
};
@@ -1009,7 +1028,7 @@ static int amd_pstate_cpu_init(struct cp
policy->fast_switch_possible = true;
static struct cpufreq_driver amd_pstate_epp_driver = {
@@ -1753,6 +1754,7 @@ static struct cpufreq_driver amd_pstate_
.set_boost = amd_pstate_set_boost,
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static int __init amd_pstate_set_driver(int mode_idx)
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
- FREQ_QOS_MIN, policy->cpuinfo.min_freq);
+ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE);
if (ret < 0) {
dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
goto free_cpudata1;
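
Restated in one place, the intent of this change: the min-frequency QoS request is now registered at FREQ_QOS_MIN_DEFAULT_VALUE, and the shared verify() callback raises an untouched lower limit to the most energy efficient frequency. A condensed sketch (assuming cpudata->lowest_nonlinear_freq was already filled in by amd_pstate_init_freq()):

```
/* Condensed sketch of the new lower-limit handling in the verify path. */
static int amd_pstate_verify_sketch(struct cpufreq_policy_data *policy_data)
{
	if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
		struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
		struct amd_cpudata *cpudata;

		if (!policy)
			return -EINVAL;

		/* Untouched policy: raise the floor to the most energy
		 * efficient (lowest non-linear) frequency. */
		cpudata = policy->driver_data;
		policy_data->min = cpudata->lowest_nonlinear_freq;
		cpufreq_cpu_put(policy);
	}

	cpufreq_verify_within_cpu_limits(policy_data);
	return 0;
}
```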


@@ -1,162 +0,0 @@
From 21492d91ffc7c3fdb6507f64a74abf8326c75141 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:59 -0500
Subject: x86/amd: Rename amd_get_highest_perf() to
amd_get_boost_ratio_numerator()
The function name is ambiguous because it returns an intermediate value
for calculating maximum frequency rather than the CPPC 'Highest Perf'
register.
Rename the function to clarify its use and allow the function to return
errors. Adjust the consumer in acpi-cpufreq to catch errors.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/include/asm/processor.h | 3 ---
arch/x86/kernel/acpi/cppc.c | 44 +++++++++++++++++++++++---------
drivers/cpufreq/acpi-cpufreq.c | 12 ++++++---
include/acpi/cppc_acpi.h | 5 ++++
4 files changed, 46 insertions(+), 18 deletions(-)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigne
}
#ifdef CONFIG_CPU_SUP_AMD
-extern u32 amd_get_highest_perf(void);
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
@@ -705,7 +703,6 @@ static __always_inline void amd_clear_di
extern void amd_check_microcode(void);
#else
-static inline u32 amd_get_highest_perf(void) { return 0; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void)
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_debug("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_debug("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+ perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
if (!perf_ratio) {
@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void)
mutex_unlock(&freq_invariance_lock);
}
-u32 amd_get_highest_perf(void)
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will the highest performance register value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
+ (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
+ *numerator = 166;
+ return 0;
+ }
if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
+ (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
+ *numerator = 166;
+ return 0;
+ }
+ *numerator = 255;
- return 255;
+ return 0;
}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned
return 0;
}
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- highest_perf = amd_get_highest_perf();
- else
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf);
+ if (ret) {
+ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n",
+ cpu, ret);
+ return 0;
+ }
+ } else {
highest_perf = perf_caps.highest_perf;
+ }
nominal_perf = perf_caps.nominal_perf;
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
@@ -234,6 +235,10 @@ static inline int cppc_get_auto_sel_caps
{
return -EOPNOTSUPP;
}
+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ return -EOPNOTSUPP;
+}
#endif /* !CONFIG_ACPI_CPPC_LIB */
#endif /* _CPPC_ACPI_H*/
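
As a concrete illustration of the arithmetic that consumes this numerator in amd_set_max_freq_ratio() (the perf values below are made up for the example):

```
/* Worked example with hypothetical perf values; SCHED_CAPACITY_SCALE == 1024. */
static u64 example_perf_ratio(void)
{
	u64 numerator    = 166;		/* from amd_get_boost_ratio_numerator() */
	u64 nominal_perf = 120;		/* example value from cppc_get_perf_caps() */
	u64 perf_ratio;

	perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
	/* 166 * 1024 / 120 = 1416, i.e. ~1.38x nominal */

	/* midpoint between max_boost and max_P, as in the patch */
	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
	/* (1416 + 1024) >> 1 = 1220, i.e. ~1.19x nominal */

	return perf_ratio;
}
```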


@@ -1,35 +0,0 @@
From 6f10d066dce0f1781b514a0352f0b427a32b1bb2 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:00 -0500
Subject: ACPI: CPPC: Drop check for non zero perf ratio
perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number, so
shifting right by one can never yield a zero value.
Drop the check.
Suggested-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void)
return;
}
- perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
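
Spelling out why the removed zero check was dead code: SCHED_CAPACITY_SCALE is 1 << SCHED_CAPACITY_SHIFT == 1024, so even a worst-case division result of zero still yields a non-zero ratio after the midpoint step:

```
/* Lower bound of the merged expression: never zero. */
u64 perf_ratio_min = (0 + SCHED_CAPACITY_SCALE) >> 1;	/* (0 + 1024) >> 1 = 512 */
```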


@@ -1,4 +1,4 @@
From db147a0a6341822a15fd9c4cd51f8dc4a9a1747b Mon Sep 17 00:00:00 2001
From e50aedf7fcd345c6f2e22a942deae007d71582df Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:27 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_register() in amd_pstate_init()


@@ -1,44 +0,0 @@
From 8c142a91a58f24119e99d4e66b11890f4a4ef984 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:01 -0500
Subject: ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn
If the boost ratio isn't calculated properly for the system for any
reason, this can cause other, non-obvious problems.
Raise all messages to warn instead.
Suggested-by: Perry Yuan <Perry.Yuan@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void)
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
rc = amd_get_boost_ratio_numerator(0, &numerator);
if (rc) {
- pr_debug("Could not retrieve highest performance (%d)\n", rc);
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
return;
}
nominal_perf = perf_caps.nominal_perf;
if (!nominal_perf) {
- pr_debug("Could not retrieve nominal performance\n");
+ pr_warn("Could not retrieve nominal performance\n");
return;
}


@@ -1,4 +1,4 @@
From 7c658490b05f6ab4dd59e1c25e75ba1037f6cfeb Mon Sep 17 00:00:00 2001
From 658af2bf561ee3883369b2caca4d688983f943de Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:29 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_set_driver() in


@@ -1,4 +1,4 @@
From 55be5db97f4f52badc958463ee8d9cbc2ae91615 Mon Sep 17 00:00:00 2001
From 0f2929e03651c101f1a6fa8ccf40167eb48c1789 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:31 +0000
Subject: cpufreq/amd-pstate: Remove the switch case in amd_pstate_init()


@@ -1,138 +0,0 @@
From 952e7bdc4cf67603f230f8eb91818ad4676e5a83 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:02 -0500
Subject: x86/amd: Move amd_get_highest_perf() out of amd-pstate
amd_pstate_get_highest_perf() is a helper used to get the highest perf
value on AMD systems. It's used in amd-pstate as part of preferred
core handling, but it is applicable to acpi-cpufreq as well.
Move it out to cppc handling code as amd_get_highest_perf().
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 30 ++++++++++++++++++++++++++++++
drivers/cpufreq/amd-pstate.c | 34 ++--------------------------------
include/acpi/cppc_acpi.h | 5 +++++
3 files changed, 37 insertions(+), 32 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void)
mutex_unlock(&freq_invariance_lock);
}
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
/**
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
* @cpu: CPU to get numerator for.
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -815,36 +815,6 @@ static void amd_pstste_sched_prefcore_wo
}
static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
-/*
- * Get the highest performance register value.
- * @cpu: CPU from which to get highest performance.
- * @highest_perf: Return address.
- *
- * Return: 0 for success, -EIO otherwise.
- */
-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
-{
- int ret;
-
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- u64 cap1;
-
- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
- if (ret)
- return ret;
- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
- } else {
- u64 cppc_highest_perf;
-
- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf);
- if (ret)
- return ret;
- WRITE_ONCE(*highest_perf, cppc_highest_perf);
- }
-
- return (ret);
-}
-
#define CPPC_MAX_PERF U8_MAX
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
@@ -852,7 +822,7 @@ static void amd_pstate_init_prefcore(str
int ret, prio;
u32 highest_perf;
- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
+ ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
if (ret)
return;
@@ -896,7 +866,7 @@ static void amd_pstate_update_limits(uns
if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
goto free_cpufreq_put;
- ret = amd_pstate_get_highest_perf(cpu, &cur_high);
+ ret = amd_get_highest_perf(cpu, &cur_high);
if (ret)
goto free_cpufreq_put;
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
@@ -235,6 +236,10 @@ static inline int cppc_get_auto_sel_caps
{
return -EOPNOTSUPP;
}
+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ return -ENODEV;
+}
static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
return -EOPNOTSUPP;
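
A minimal usage sketch of the relocated helper, matching how the amd-pstate preferred-core code now calls it:

```
/* Sketch: read one CPU's highest perf value via the relocated helper. */
#include <acpi/cppc_acpi.h>

static void example_read_highest_perf(unsigned int cpu)
{
	u32 highest_perf;

	if (amd_get_highest_perf(cpu, &highest_perf))
		return;		/* MSR or ACPI CPPC read failed */

	pr_debug("CPU%u highest perf: %u\n", cpu, highest_perf);
}
```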


@@ -1,4 +1,4 @@
From 7305364888151cb9e6b435c5f219ccfd18132b58 Mon Sep 17 00:00:00 2001
From bec664db265db933a107bac290601eb89acb938b Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:33 +0000
Subject: cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call


@@ -1,251 +0,0 @@
From 3ab7da5bbf2087982dbfe2b0f2937d0dddc3afb1 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:03 -0500
Subject: x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()
AMD systems that support preferred cores will use "166" as their
numerator for max frequency calculations instead of "255".
Add a function for detecting preferred cores by looking at the
highest perf value on all cores.
If preferred cores are enabled, return 166; if disabled, return the
value in the highest perf register. As the function will be called
multiple times, cache the boost numerator and the preferred core
support status in global variables.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 93 ++++++++++++++++++++++++++++++++----
drivers/cpufreq/amd-pstate.c | 34 +++++--------
include/acpi/cppc_acpi.h | 5 ++
3 files changed, 101 insertions(+), 31 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,16 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -147,6 +157,66 @@ out:
EXPORT_SYMBOL_GPL(amd_get_highest_perf);
/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
* @cpu: CPU to get numerator for.
* @numerator: Output variable for numerator.
@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf);
* a CPU. On systems that support preferred cores, this will be a hardcoded
* value. On other systems this will the highest performance register value.
*
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
* Return: 0 for success, negative error code otherwise.
*/
int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
- *numerator = 166;
- return 0;
- }
+ bool prefcore;
+ int ret;
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
- *numerator = 166;
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
return 0;
}
- *numerator = 255;
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
return 0;
}
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -819,32 +819,18 @@ static DECLARE_WORK(sched_prefcore_work,
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
{
- int ret, prio;
- u32 highest_perf;
-
- ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
- if (ret)
+ /* user disabled or not detected */
+ if (!amd_pstate_prefcore)
return;
cpudata->hw_prefcore = true;
- /* check if CPPC preferred core feature is enabled*/
- if (highest_perf < CPPC_MAX_PERF)
- prio = (int)highest_perf;
- else {
- pr_debug("AMD CPPC preferred core is unsupported!\n");
- cpudata->hw_prefcore = false;
- return;
- }
-
- if (!amd_pstate_prefcore)
- return;
/*
* The priorities can be set regardless of whether or not
* sched_set_itmt_support(true) has been called and it is valid to
* update them at any time after it has been called.
*/
- sched_set_itmt_core_prio(prio, cpudata->cpu);
+ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu);
schedule_work(&sched_prefcore_work);
}
@@ -1015,12 +1001,12 @@ static int amd_pstate_cpu_init(struct cp
cpudata->cpu = policy->cpu;
- amd_pstate_init_prefcore(cpudata);
-
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
+ amd_pstate_init_prefcore(cpudata);
+
ret = amd_pstate_init_freq(cpudata);
if (ret)
goto free_cpudata1;
@@ -1481,12 +1467,12 @@ static int amd_pstate_epp_cpu_init(struc
cpudata->cpu = policy->cpu;
cpudata->epp_policy = 0;
- amd_pstate_init_prefcore(cpudata);
-
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
+ amd_pstate_init_prefcore(cpudata);
+
ret = amd_pstate_init_freq(cpudata);
if (ret)
goto free_cpudata1;
@@ -1948,6 +1934,12 @@ static int __init amd_pstate_init(void)
static_call_update(amd_pstate_update_perf, cppc_update_perf);
}
+ if (amd_pstate_prefcore) {
+ ret = amd_detect_prefcore(&amd_pstate_prefcore);
+ if (ret)
+ return ret;
+ }
+
/* enable amd pstate feature */
ret = amd_pstate_enable(true);
if (ret) {
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -163,6 +163,7 @@ extern int cppc_get_auto_sel_caps(int cp
extern int cppc_set_auto_sel(int cpu, bool enable);
extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
+extern int amd_detect_prefcore(bool *detected);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
@@ -244,6 +245,10 @@ static inline int amd_get_boost_ratio_nu
{
return -EOPNOTSUPP;
}
+static inline int amd_detect_prefcore(bool *detected)
+{
+ return -ENODEV;
+}
#endif /* !CONFIG_ACPI_CPPC_LIB */
#endif /* _CPPC_ACPI_H*/
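
Condensed, the two exported helpers are meant to be used together like this (mirroring the amd_pstate_init() and amd_get_boost_ratio_numerator() call sites in the hunks above):

```
/* Sketch of the detection flow added by this patch. */
static int example_boost_numerator(unsigned int cpu, u64 *numerator)
{
	bool prefcore;
	int ret;

	/* Scans present CPUs once, comparing highest perf values, and caches
	 * the result so subsequent calls are cheap. */
	ret = amd_detect_prefcore(&prefcore);
	if (ret)
		return ret;

	/* With preferred cores the fixed CPPC_HIGHEST_PERF_PREFCORE (166) is
	 * used; without them the cached highest perf register value is. */
	return amd_get_boost_ratio_numerator(cpu, numerator);
}
```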


@@ -1,169 +0,0 @@
From 68d89574b86625f4bd7a784fe9bcc221dc290e4f Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:04 -0500
Subject: cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into
amd_get_boost_ratio_numerator()
The special case in amd_pstate_highest_perf_set() is the value used
for calculating the boost numerator. Merge this into
amd_get_boost_ratio_numerator() and then use that to calculate the
boost ratio.
This allows dropping more special casing of the highest perf value.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 3 +-
arch/x86/kernel/acpi/cppc.c | 16 +++++++
drivers/cpufreq/amd-pstate.c | 52 ++++-----------------
3 files changed, 28 insertions(+), 43 deletions(-)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Perfo
In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
table, so we need to expose it to sysfs. If boost is not active, but
still supported, this maximum frequency will be larger than the one in
-``cpuinfo``.
+``cpuinfo``. On systems that support preferred core, the driver will have
+different values for some cores than others.
This attribute is read-only.
``amd_pstate_lowest_nonlinear_freq``
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,7 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
#define CPPC_HIGHEST_PERF_PREFCORE 166
enum amd_pref_core {
@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsign
*numerator = boost_numerator;
return 0;
}
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
*numerator = CPPC_HIGHEST_PERF_PREFCORE;
return 0;
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -52,8 +52,6 @@
#define AMD_PSTATE_TRANSITION_LATENCY 20000
#define AMD_PSTATE_TRANSITION_DELAY 1000
#define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600
-#define CPPC_HIGHEST_PERF_PERFORMANCE 196
-#define CPPC_HIGHEST_PERF_DEFAULT 166
#define AMD_CPPC_EPP_PERFORMANCE 0x00
#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80
@@ -398,43 +396,17 @@ static inline int amd_pstate_enable(bool
return static_call(amd_pstate_enable)(enable);
}
-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /*
- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
- * the highest performance level is set to 196.
- * https://bugzilla.kernel.org/show_bug.cgi?id=218759
- */
- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f))
- return CPPC_HIGHEST_PERF_PERFORMANCE;
-
- return CPPC_HIGHEST_PERF_DEFAULT;
-}
-
static int pstate_init_perf(struct amd_cpudata *cpudata)
{
u64 cap1;
- u32 highest_perf;
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
&cap1);
if (ret)
return ret;
- /* For platforms that do not support the preferred core feature, the
- * highest_pef may be configured with 166 or 255, to avoid max frequency
- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as
- * the default max perf.
- */
- if (cpudata->hw_prefcore)
- highest_perf = amd_pstate_highest_perf_set(cpudata);
- else
- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1));
WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
@@ -446,19 +418,13 @@ static int pstate_init_perf(struct amd_c
static int cppc_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
- u32 highest_perf;
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
if (ret)
return ret;
- if (cpudata->hw_prefcore)
- highest_perf = amd_pstate_highest_perf_set(cpudata);
- else
- highest_perf = cppc_perf.highest_perf;
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
+ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf);
WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
WRITE_ONCE(cpudata->lowest_nonlinear_perf,
cppc_perf.lowest_nonlinear_perf);
@@ -922,8 +888,8 @@ static u32 amd_pstate_get_transition_lat
static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
{
int ret;
- u32 min_freq;
- u32 highest_perf, max_freq;
+ u32 min_freq, max_freq;
+ u64 numerator;
u32 nominal_perf, nominal_freq;
u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
u32 boost_ratio, lowest_nonlinear_ratio;
@@ -945,8 +911,10 @@ static int amd_pstate_init_freq(struct a
nominal_perf = READ_ONCE(cpudata->nominal_perf);
- highest_perf = READ_ONCE(cpudata->highest_perf);
- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
+ if (ret)
+ return ret;
+ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
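
To see what feeds into the frequency table after this change, here is the max frequency arithmetic from amd_pstate_init_freq() with example numbers (nominal_perf and nominal_freq are hypothetical; the 196 numerator is the Zen4 model 0x70-0x7f special case relocated by this patch):

```
/* Worked example with hypothetical nominal values. */
static u32 example_max_freq(void)
{
	u64 numerator    = 196;		/* CPPC_HIGHEST_PERF_PERFORMANCE case */
	u32 nominal_perf = 141;		/* example CPPC nominal perf */
	u32 nominal_freq = 3600;	/* example nominal frequency, MHz */
	u32 boost_ratio, max_freq;

	boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
	/* 196 * 1024 / 141 = 1423 */

	max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
	/* 3600 * 1423 / 1024 * 1000 ~= 5,002,000 kHz, i.e. a ~5.0 GHz ceiling */

	return max_freq;
}
```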


@@ -1,4 +1,4 @@
From 5886ef269d069c72ea952cb00699e16221289e8c Mon Sep 17 00:00:00 2001
From 82d4b19d01912cd9c2cfb3c638a43cc2b04ffc63 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 17 Oct 2024 12:34:39 -0500
Subject: cpufreq/amd-pstate-ut: Add fix for min freq unit test


@@ -1,4 +1,4 @@
From 497447cf96a785a4edd0756da5d5718037f5687c Mon Sep 17 00:00:00 2001
From b1bc26a3ecd5afc4d6106332910fbdc3f6718acd Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 21 Oct 2024 15:48:36 +0530
Subject: amd-pstate: Switch to amd-pstate by default on some Server platforms


@@ -1,42 +0,0 @@
From deed718125e73b6bf280dcebb80c39108226388c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:05 -0500
Subject: cpufreq: amd-pstate: Optimize amd_pstate_update_limits()
Don't take and release the mutex when prefcore isn't present, and
avoid initializing variables that will be set later in the function
anyway.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -814,17 +814,17 @@ static void amd_pstate_update_limits(uns
cpudata = policy->driver_data;
- mutex_lock(&amd_pstate_driver_lock);
- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
- goto free_cpufreq_put;
+ if (!amd_pstate_prefcore)
+ return;
+ mutex_lock(&amd_pstate_driver_lock);
ret = amd_get_highest_perf(cpu, &cur_high);
if (ret)
goto free_cpufreq_put;
prev_high = READ_ONCE(cpudata->prefcore_ranking);
- if (prev_high != cur_high) {
- highest_perf_changed = true;
+ highest_perf_changed = (prev_high != cur_high);
+ if (highest_perf_changed) {
WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
if (cur_high < CPPC_MAX_PERF)


@@ -1,29 +0,0 @@
From 391075a34e392c7cacd338a6b034a21a10679855 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:06 -0500
Subject: cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore`
Explain that the sysfs file indicates both that the preferred core
feature is supported by the hardware and that it has been enabled by
the user.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++
1 file changed, 5 insertions(+)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CP
<perf_cap_>`_.)
This attribute is read-only.
+``amd_pstate_hw_prefcore``
+
+Whether the platform supports the preferred core feature and it has been
+enabled. This attribute is read-only.
+
``energy_performance_available_preferences``
A list of all the supported EPP preferences that could be used for


@@ -0,0 +1,121 @@
From ca32f306d11e95ca133a2f90249bb4555c2742c3 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 23 Oct 2024 10:21:06 +0000
Subject: cpufreq/amd-pstate: Rename functions that enable CPPC
Explicitly rename functions that enable CPPC as *_cppc_*.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -336,7 +336,7 @@ static int amd_pstate_set_energy_pref_in
return ret;
}
-static inline int msr_enable(bool enable)
+static inline int msr_cppc_enable(bool enable)
{
int ret, cpu;
unsigned long logical_proc_id_mask = 0;
@@ -362,7 +362,7 @@ static inline int msr_enable(bool enable
return 0;
}
-static int shmem_enable(bool enable)
+static int shmem_cppc_enable(bool enable)
{
int cpu, ret = 0;
struct cppc_perf_ctrls perf_ctrls;
@@ -389,11 +389,11 @@ static int shmem_enable(bool enable)
return ret;
}
-DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable);
+DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable);
-static inline int amd_pstate_enable(bool enable)
+static inline int amd_pstate_cppc_enable(bool enable)
{
- return static_call(amd_pstate_enable)(enable);
+ return static_call(amd_pstate_cppc_enable)(enable);
}
static int msr_init_perf(struct amd_cpudata *cpudata)
@@ -1072,7 +1072,7 @@ static int amd_pstate_cpu_resume(struct
{
int ret;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret)
pr_err("failed to enable amd-pstate during resume, return %d\n", ret);
@@ -1083,7 +1083,7 @@ static int amd_pstate_cpu_suspend(struct
{
int ret;
- ret = amd_pstate_enable(false);
+ ret = amd_pstate_cppc_enable(false);
if (ret)
pr_err("failed to disable amd-pstate during suspend, return %d\n", ret);
@@ -1216,7 +1216,7 @@ static ssize_t show_energy_performance_p
static void amd_pstate_driver_cleanup(void)
{
- amd_pstate_enable(false);
+ amd_pstate_cppc_enable(false);
cppc_state = AMD_PSTATE_DISABLE;
current_pstate_driver = NULL;
}
@@ -1250,7 +1250,7 @@ static int amd_pstate_register_driver(in
cppc_state = mode;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret) {
pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n",
ret);
@@ -1629,7 +1629,7 @@ static void amd_pstate_epp_reenable(stru
u64 value, max_perf;
int ret;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret)
pr_err("failed to enable amd pstate during resume, return %d\n", ret);
@@ -1716,7 +1716,7 @@ static int amd_pstate_epp_suspend(struct
cpudata->suspended = true;
/* disable CPPC in lowlevel firmware */
- ret = amd_pstate_enable(false);
+ ret = amd_pstate_cppc_enable(false);
if (ret)
pr_err("failed to suspend, return %d\n", ret);
@@ -1891,7 +1891,7 @@ static int __init amd_pstate_init(void)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
- static_call_update(amd_pstate_enable, shmem_enable);
+ static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable);
static_call_update(amd_pstate_init_perf, shmem_init_perf);
static_call_update(amd_pstate_update_perf, shmem_update_perf);
}
@@ -1916,7 +1916,7 @@ static int __init amd_pstate_init(void)
global_attr_free:
cpufreq_unregister_driver(current_pstate_driver);
- amd_pstate_enable(false);
+ amd_pstate_cppc_enable(false);
return ret;
}
device_initcall(amd_pstate_init);


@@ -1,42 +0,0 @@
From 2ed9874f6dcafcc2bee7a922af9e1d1c62dbeb18 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:07 -0500
Subject: amd-pstate: Add missing documentation for
`amd_pstate_prefcore_ranking`
`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU
core based on platform conditions. Explicitly include it in the
documentation.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performa
table, so we need to expose it to sysfs. If boost is not active, but
still supported, this maximum frequency will be larger than the one in
``cpuinfo``. On systems that support preferred core, the driver will have
-different values for some cores than others.
+different values for some cores than others and this will reflect the values
+advertised by the platform at bootup.
This attribute is read-only.
``amd_pstate_lowest_nonlinear_freq``
@@ -268,6 +269,12 @@ This attribute is read-only.
Whether the platform supports the preferred core feature and it has been
enabled. This attribute is read-only.
+``amd_pstate_prefcore_ranking``
+
+The performance ranking of the core. This number doesn't have any unit, but
+larger numbers are preferred at the time of reading. This can change at
+runtime based on platform conditions. This attribute is read-only.
+
``energy_performance_available_preferences``
A list of all the supported EPP preferences that could be used for


@@ -0,0 +1,30 @@
From de31d4a11e894a73ed7ef2388fb27f0bb4036de3 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 23 Oct 2024 10:21:08 +0000
Subject: cpufreq/amd-pstate: Do not attempt to clear MSR_AMD_CPPC_ENABLE
MSR_AMD_CPPC_ENABLE is a write-once register, i.e. attempting to clear
it is futile; it will not take effect. Hence, return early if the
disable (0) argument is passed to msr_cppc_enable().
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -341,6 +341,12 @@ static inline int msr_cppc_enable(bool e
int ret, cpu;
unsigned long logical_proc_id_mask = 0;
+ /*
+ * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared.
+ */
+ if (!enable)
+ return 0;
+
if (enable == cppc_enabled)
return 0;


@@ -0,0 +1,39 @@
From c5eadae39ccdf6bada95c834147e285e445b12ad Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 23 Oct 2024 10:21:10 +0000
Subject: cpufreq/amd-pstate: Call cppc_set_epp_perf in the reenable function
The EPP value being set in perf_ctrls.energy_perf is not being propagated
to the shared memory; fix that.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1646,8 +1646,9 @@ static void amd_pstate_epp_reenable(stru
wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
} else {
perf_ctrls.max_perf = max_perf;
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached);
cppc_set_perf(cpudata->cpu, &perf_ctrls);
+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached);
+ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
}
}
@@ -1688,8 +1689,9 @@ static void amd_pstate_epp_offline(struc
} else {
perf_ctrls.desired_perf = 0;
perf_ctrls.max_perf = min_perf;
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);
cppc_set_perf(cpudata->cpu, &perf_ctrls);
+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);
+ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
}
mutex_unlock(&amd_pstate_limits_lock);
}


@@ -0,0 +1,25 @@
From 275a7a8c95ac5d0c0ff586acd882d751478f6a00 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 23 Oct 2024 10:21:12 +0000
Subject: cpufreq/amd-pstate: Align offline flow of shared memory and MSR based
systems
Set min_perf to lowest_perf for shared memory systems, similar to the MSR
based systems.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1688,6 +1688,7 @@ static void amd_pstate_epp_offline(struc
wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
} else {
perf_ctrls.desired_perf = 0;
+ perf_ctrls.min_perf = min_perf;
perf_ctrls.max_perf = min_perf;
cppc_set_perf(cpudata->cpu, &perf_ctrls);
perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);


@@ -1,24 +0,0 @@
From 185e64a7e1a749593f3d6dadc666da9dda82d48c Mon Sep 17 00:00:00 2001
From: Qianqiang Liu <qianqiang.liu@163.com>
Date: Wed, 11 Sep 2024 07:39:24 +0800
Subject: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue
The uninitialized value "mode2" is used when calling "amd_pstate_get_mode_string".
Set "mode2" to "AMD_PSTATE_DISABLE" by default.
Signed-off-by: Qianqiang Liu <qianqiang.liu@163.com>
---
drivers/cpufreq/amd-pstate-ut.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -270,7 +270,7 @@ static int amd_pstate_set_mode(enum amd_
static void amd_pstate_ut_check_driver(u32 index)
{
- enum amd_pstate_mode mode1, mode2;
+ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
int ret;
for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {


@@ -0,0 +1,60 @@
From 173172a4604a050ccb5a6745acd9fa90d01257a7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 25 Oct 2024 12:14:55 -0500
Subject: x86/cpufeatures: Rename X86_FEATURE_FAST_CPPC to have AMD prefix
This is an AMD-unique feature of some processors, so put
AMD into the name.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/include/asm/cpufeatures.h | 2 +-
arch/x86/kernel/cpu/scattered.c | 2 +-
drivers/cpufreq/amd-pstate.c | 2 +-
tools/arch/x86/include/asm/cpufeatures.h | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -473,7 +473,7 @@
#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
+#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */
/*
* BUG word(s)
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,7 +45,7 @@ static const struct cpuid_bit cpuid_bits
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
- { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -880,7 +880,7 @@ static u32 amd_pstate_get_transition_del
transition_delay_ns = cppc_get_transition_latency(cpu);
if (transition_delay_ns == CPUFREQ_ETERNAL) {
- if (cpu_feature_enabled(X86_FEATURE_FAST_CPPC))
+ if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC))
return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY;
else
return AMD_PSTATE_TRANSITION_DELAY;
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -472,7 +472,7 @@
#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
+#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
/*
* BUG word(s)


@@ -1,115 +0,0 @@
From 787175146e26a199c06be4e6bf8cf8da0f757271 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:52 +0000
Subject: cpufreq: Add a callback to update the min_freq_req from drivers
Currently, there is no proper way to update the initial lower frequency
limit from cpufreq drivers. The only way is to add a new min_freq qos
request from the driver side, but it leads to the issue explained below.
The QoS infrastructure collates the constraints from multiple
subsystems and saves them in a plist. The "current value" is defined to
be the highest value in the plist for min_freq constraint.
The cpufreq core adds a qos_request for min_freq to be 0 and the amd-pstate
driver today adds qos request for min_freq to be lowest_freq, where
lowest_freq corresponds to CPPC.lowest_perf.
E.g.: consider the amd-pstate driver, where lowest_freq is 400000 KHz and
lowest_non_linear_freq is 1200000 KHz.
At this point of time, the min_freq QoS plist looks like:
head--> 400000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)
When a user updates /sys/devices/system/cpu/cpuX/cpufreq/scaling_min_freq,
it only results in updating the cpufreq-core's node in the plist, where
say 0 becomes the newly echoed value.
Now, if the user echoes a value 1000000 KHz, to scaling_min_freq, then the
new list would be
head--> 1000000 KHz (registered by cpufreq core) --> 400000 KHz (registered
by amd-pstate)
and the new "current value" of the min_freq QoS constraint will be 1000000
KHz, this is the scenario where it works as expected.
Suppose we change the amd-pstate driver code's min_freq qos constraint
to lowest_non_linear_freq instead of lowest_freq, then the user will
never be able to request a value below that, due to the following:
At boot time, the min_freq QoS plist would be
head--> 1200000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)
When the user echoes a value of 1000000 KHz, to
/sys/devices/..../scaling_min_freq, then the new list would be
head--> 1200000 KHz (registered by amd-pstate) --> 1000000 KHz (registered
by cpufreq core)
with the new "current value" of the min_freq QoS remaining 1200000 KHz.
Since the current value has not changed, there won't be any notifications
sent to the subsystems which have added their QoS constraints. In
particular, the amd-pstate driver will not get the notification, and thus,
the user's request to lower the scaling_min_freq will be ineffective.
Hence, it is advisable to have a single source of truth for the min and
max freq QoS constraints between the cpufreq and the cpufreq drivers.
So add a new callback get_init_min_freq() to struct cpufreq_driver,
which allows amd-pstate (or any other cpufreq driver) to override the
default min_freq value being set in policy->min_freq_req. Now
scaling_min_freq can be modified by the user to any value (lower or
higher than the init value) later on if desired.
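As a rough illustration (not part of this patch), a driver opting into the new callback could look like the hedged sketch below; the example_* names are hypothetical, while get_init_min_freq and lowest_nonlinear_freq mirror what this series uses:
/* Hypothetical driver-side sketch: seed policy->min_freq_req with the
 * most energy-efficient frequency instead of the cpufreq core default of 0. */
static int example_get_init_min_freq(struct cpufreq_policy *policy)
{
	struct example_cpudata *cpudata = policy->driver_data;
	return READ_ONCE(cpudata->lowest_nonlinear_freq);
}
static struct cpufreq_driver example_driver = {
	.name              = "example-pstate",
	.get_init_min_freq = example_get_init_min_freq,
};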
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/cpufreq.c | 6 +++++-
include/linux/cpufreq.h | 6 ++++++
2 files changed, 11 insertions(+), 1 deletion(-)
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1380,6 +1380,7 @@ static int cpufreq_online(unsigned int c
bool new_policy;
unsigned long flags;
unsigned int j;
+ u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
@@ -1464,9 +1465,12 @@ static int cpufreq_online(unsigned int c
goto out_destroy_policy;
}
+ if (cpufreq_driver->get_init_min_freq)
+ init_min_freq = cpufreq_driver->get_init_min_freq(policy);
+
ret = freq_qos_add_request(&policy->constraints,
policy->min_freq_req, FREQ_QOS_MIN,
- FREQ_QOS_MIN_DEFAULT_VALUE);
+ init_min_freq);
if (ret < 0) {
/*
* So we don't call freq_qos_remove_request() for an
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -414,6 +414,12 @@ struct cpufreq_driver {
* policy is properly initialized, but before the governor is started.
*/
void (*register_em)(struct cpufreq_policy *policy);
+
+ /*
+ * Set by drivers that want to initialize the policy->min_freq_req with
+ * a value different from the default value (0) in cpufreq core.
+ */
+ int (*get_init_min_freq)(struct cpufreq_policy *policy);
};
/* flags */


@@ -0,0 +1,43 @@
From 34ef29430b9a217ef0811f218b8f86c631d6c574 Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Fri, 25 Oct 2024 12:14:56 -0500
Subject: x86/cpufeatures: Add feature bits for AMD heterogeneous processor
CPUID leaf 0x80000026 advertises core types with different efficiency
rankings.
Bit 30 indicates the heterogeneous core topology feature: if the bit is
set, not all instances at the current hierarchical level have the same
core topology.
This is described in the AMD64 Architecture Programmer's Manual Volumes
2 and 3, doc ID #25493 and #25494.
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
Co-developed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/kernel/cpu/scattered.c | 1 +
2 files changed, 2 insertions(+)
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -474,6 +474,7 @@
#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */
+#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */
/*
* BUG word(s)
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -52,6 +52,7 @@ static const struct cpuid_bit cpuid_bits
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 },
{ 0, 0, 0, 0, 0 }
};


@@ -0,0 +1,32 @@
From 5bd4c7b75f5588eb81bee35179569e27a4e164e4 Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Fri, 25 Oct 2024 12:14:57 -0500
Subject: x86/cpu: Enable SD_ASYM_PACKING for PKG Domain on AMD Processors
Enable the SD_ASYM_PACKING domain flag for the PKG domain on AMD
heterogeneous processors.
This flag is beneficial for processors with one or more CCDs and
relies on x86_sched_itmt_flags().
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
Co-developed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/kernel/smpboot.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -497,8 +497,9 @@ static int x86_cluster_flags(void)
static int x86_die_flags(void)
{
- if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return x86_sched_itmt_flags();
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU) ||
+ cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
+ return x86_sched_itmt_flags();
return 0;
}


@@ -1,103 +0,0 @@
From f7b2b3a1c0d015c4272793bed89734c5cffb354c Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:56 +0000
Subject: cpufreq/amd-pstate: Cleanup the old min_freq qos request remnants
Convert the freq_qos_request array in struct amd_cpudata to a single
variable (only for max_freq request). Remove the references to cpudata->req
array. Remove and rename the jump labels accordingly.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 19 ++++++++-----------
drivers/cpufreq/amd-pstate.h | 4 ++--
2 files changed, 10 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -704,7 +704,7 @@ static int amd_pstate_cpu_boost_update(s
policy->max = policy->cpuinfo.max_freq;
if (cppc_state == AMD_PSTATE_PASSIVE) {
- ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
+ ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
if (ret < 0)
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
}
@@ -971,17 +971,17 @@ static int amd_pstate_cpu_init(struct cp
ret = amd_pstate_init_perf(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
amd_pstate_init_prefcore(cpudata);
ret = amd_pstate_init_freq(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
ret = amd_pstate_init_boost_support(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
min_freq = READ_ONCE(cpudata->min_freq);
max_freq = READ_ONCE(cpudata->max_freq);
@@ -1003,11 +1003,11 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
- goto free_cpudata2;
+ goto free_cpudata;
}
cpudata->max_limit_freq = max_freq;
@@ -1020,9 +1020,7 @@ static int amd_pstate_cpu_init(struct cp
return 0;
-free_cpudata2:
- freq_qos_remove_request(&cpudata->req[0]);
-free_cpudata1:
+free_cpudata:
kfree(cpudata);
return ret;
}
@@ -1031,8 +1029,7 @@ static void amd_pstate_cpu_exit(struct c
{
struct amd_cpudata *cpudata = policy->driver_data;
- freq_qos_remove_request(&cpudata->req[1]);
- freq_qos_remove_request(&cpudata->req[0]);
+ freq_qos_remove_request(&cpudata->max_freq_req);
policy->fast_switch_possible = false;
kfree(cpudata);
}
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
/**
* struct amd_cpudata - private CPU data for AMD P-State
* @cpu: CPU number
- * @req: constraint request to apply
+ * @max_freq_req: maximum frequency constraint request to apply
* @cppc_req_cached: cached performance request hints
* @highest_perf: the maximum performance an individual processor may reach,
* assuming ideal conditions
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
struct amd_cpudata {
int cpu;
- struct freq_qos_request req[2];
+ struct freq_qos_request max_freq_req;
u64 cppc_req_cached;
u32 highest_perf;


@@ -0,0 +1,187 @@
From c30f11be765d5a5a68e975fbe720a4bdb6900388 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Fri, 25 Oct 2024 12:14:58 -0500
Subject: x86/cpu: Add CPU type to struct cpuinfo_topology
Sometimes it is required to take actions based on if a CPU is a performance
or efficiency core. As an example, intel_pstate driver uses the Intel
core-type to determine CPU scaling. Also, some CPU vulnerabilities only
affect a specific CPU type, like RFDS only affects Intel Atom. For hybrid
systems that have variants P+E, P-only (Core) and E-only (Atom), it is not
straightforward to identify which variant is affected by a type-specific
vulnerability.
Such processors do have a CPUID field that can uniquely identify them. For
example, P+E, P-only and E-only all enumerate the CPUID.1A.CORE_TYPE
identification, while P+E additionally enumerates CPUID.7.HYBRID. Based on
this information, it is possible for the boot CPU to identify whether a
system has mixed CPU types.
Add a new field hw_cpu_type to struct cpuinfo_topology that stores the
hardware specific CPU type. This saves the overhead of IPIs to get the CPU
type of a different CPU. CPU type is populated early in the boot process,
before vulnerabilities are enumerated.
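As a hedged aside (not part of this patch): once topo.cpu_type is populated, a consumer can classify a remote CPU without an IPI using the helpers this series adds; the function name below is hypothetical:
/* Hypothetical consumer sketch: classify a CPU from its cached topology,
 * using get_topology_cpu_type() added by this patch. */
static bool example_cpu_is_efficiency_core(unsigned int cpu)
{
	return get_topology_cpu_type(&cpu_data(cpu)) == TOPO_CPU_TYPE_EFFICIENCY;
}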
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Co-developed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/include/asm/intel-family.h | 6 +++++
arch/x86/include/asm/processor.h | 18 ++++++++++++++
arch/x86/include/asm/topology.h | 9 +++++++
arch/x86/kernel/cpu/debugfs.c | 1 +
arch/x86/kernel/cpu/topology_amd.c | 3 +++
arch/x86/kernel/cpu/topology_common.c | 34 +++++++++++++++++++++++++++
6 files changed, 71 insertions(+)
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -183,4 +183,10 @@
/* Family 19 */
#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */
+/* CPU core types */
+enum intel_cpu_type {
+ INTEL_CPU_TYPE_ATOM = 0x20,
+ INTEL_CPU_TYPE_CORE = 0x40,
+};
+
#endif /* _ASM_X86_INTEL_FAMILY_H */
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -105,6 +105,24 @@ struct cpuinfo_topology {
// Cache level topology IDs
u32 llc_id;
u32 l2c_id;
+
+ // Hardware defined CPU-type
+ union {
+ u32 cpu_type;
+ struct {
+ // CPUID.1A.EAX[23-0]
+ u32 intel_native_model_id :24;
+ // CPUID.1A.EAX[31-24]
+ u32 intel_type :8;
+ };
+ struct {
+ // CPUID 0x80000026.EBX
+ u32 amd_num_processors :16,
+ amd_power_eff_ranking :8,
+ amd_native_model_id :4,
+ amd_type :4;
+ };
+ };
};
struct cpuinfo_x86 {
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -114,6 +114,12 @@ enum x86_topology_domains {
TOPO_MAX_DOMAIN,
};
+enum x86_topology_cpu_type {
+ TOPO_CPU_TYPE_PERFORMANCE,
+ TOPO_CPU_TYPE_EFFICIENCY,
+ TOPO_CPU_TYPE_UNKNOWN,
+};
+
struct x86_topology_system {
unsigned int dom_shifts[TOPO_MAX_DOMAIN];
unsigned int dom_size[TOPO_MAX_DOMAIN];
@@ -149,6 +155,9 @@ extern unsigned int __max_threads_per_co
extern unsigned int __num_threads_per_package;
extern unsigned int __num_cores_per_package;
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c);
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c);
+
static inline unsigned int topology_max_packages(void)
{
return __max_logical_packages;
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -22,6 +22,7 @@ static int cpu_debug_show(struct seq_fil
seq_printf(m, "die_id: %u\n", c->topo.die_id);
seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -182,6 +182,9 @@ static void parse_topology_amd(struct to
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
has_topoext = cpu_parse_topology_ext(tscan);
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
if (!has_topoext && !parse_8000_0008(tscan))
return;
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -3,6 +3,7 @@
#include <xen/xen.h>
+#include <asm/intel-family.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/smp.h>
@@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *
}
}
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
{
struct {
@@ -87,6 +118,7 @@ static void parse_topology(struct topo_s
.cu_id = 0xff,
.llc_id = BAD_APICID,
.l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
};
struct cpuinfo_x86 *c = tscan->c;
struct {
@@ -132,6 +164,8 @@ static void parse_topology(struct topo_s
case X86_VENDOR_INTEL:
if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
break;
case X86_VENDOR_HYGON:
if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))


@@ -1,57 +0,0 @@
From c4fde0d177bdb33912f450914d84d6432391a8b5 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:16 -0500
Subject: cpufreq/amd-pstate: Use nominal perf for limits when boost is
disabled
When boost has been disabled, the limit for perf should be nominal perf,
not the highest perf. Using the latter to do calculations will lead to
incorrect values that are still above nominal.
Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()")
Reported-by: Peter Jung <ptr1337@cachyos.org>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219348
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -566,11 +566,16 @@ static int amd_pstate_verify(struct cpuf
static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
{
- u32 max_limit_perf, min_limit_perf, lowest_perf;
+ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf;
struct amd_cpudata *cpudata = policy->driver_data;
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+ if (cpudata->boost_supported && !policy->boost_enabled)
+ max_perf = READ_ONCE(cpudata->nominal_perf);
+ else
+ max_perf = READ_ONCE(cpudata->highest_perf);
+
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
lowest_perf = READ_ONCE(cpudata->lowest_perf);
if (min_limit_perf < lowest_perf)
@@ -1504,10 +1509,13 @@ static int amd_pstate_epp_update_limit(s
u64 value;
s16 epp;
- max_perf = READ_ONCE(cpudata->highest_perf);
+ if (cpudata->boost_supported && !policy->boost_enabled)
+ max_perf = READ_ONCE(cpudata->nominal_perf);
+ else
+ max_perf = READ_ONCE(cpudata->highest_perf);
min_perf = READ_ONCE(cpudata->lowest_perf);
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
if (min_limit_perf < min_perf)
min_limit_perf = min_perf;


@@ -0,0 +1,71 @@
From ab9618cbe5e3d55b09b59f5e18e890be80ca1076 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 25 Oct 2024 12:14:59 -0500
Subject: x86/amd: Use heterogeneous core topology for identifying boost
numerator
AMD heterogeneous designs include two types of cores:
* Performance
* Efficiency
Each core type has different highest performance values configured by the
platform. Drivers such as `amd_pstate` need to identify the type of
core to correctly set an appropriate boost numerator to calculate the
maximum frequency.
X86_FEATURE_AMD_HETEROGENEOUS_CORES is used to identify whether the SoC
supports heterogeneous core types by reading CPUID leaf Fn_0x80000026.
On performance cores the scaling factor of 196 is used. On efficiency
cores the scaling factor is the value reported as the highest perf.
Efficiency cores have the same preferred core rankings.
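To make the effect concrete, a small worked example with made-up numbers follows (the driver roughly scales nominal_freq by numerator/nominal_perf to derive the boosted maximum; exact rounding differs):
/* Hypothetical values, for illustration only:
 *   nominal_freq = 3800 MHz, nominal_perf = 154
 *   performance core: numerator = 196 (CPPC_HIGHEST_PERF_PERFORMANCE)
 *     max_freq ~= 3800 * 196 / 154 ~= 4836 MHz
 *   efficiency core:  numerator = highest_perf, e.g. 166
 *     max_freq ~= 3800 * 166 / 154 ~= 4096 MHz
 */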
Tested-by: Eric Naim <dnaim@cachyos.org>
Tested-by: Peter Jung <ptr1337@cachyos.org>
Suggested-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -239,8 +239,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore);
*/
int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
+ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu));
bool prefcore;
int ret;
+ u32 tmp;
ret = amd_detect_prefcore(&prefcore);
if (ret)
@@ -266,6 +268,27 @@ int amd_get_boost_ratio_numerator(unsign
break;
}
}
+
+ /* detect if running on heterogeneous design */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) {
+ switch (core_type) {
+ case TOPO_CPU_TYPE_UNKNOWN:
+ pr_warn("Undefined core type found for cpu %d\n", cpu);
+ break;
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ /* use the max scale for performance cores */
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ /* use the highest perf value for efficiency cores */
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+ *numerator = tmp;
+ return 0;
+ }
+ }
+
*numerator = CPPC_HIGHEST_PERF_PREFCORE;
return 0;


@@ -1,4 +1,4 @@
From a4d255935a1ea6e4b10167df942ec641079bcdf7 Mon Sep 17 00:00:00 2001
From f1a423d6cfc0888638642bea1b8ed5c64770888c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:41 -0500
Subject: cpufreq/amd-pstate: Push adjust_perf vfunc init into cpu_init
@@ -18,7 +18,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1528,6 +1528,8 @@ static int amd_pstate_epp_cpu_init(struc
@@ -1534,6 +1534,8 @@ static int amd_pstate_epp_cpu_init(struc
WRITE_ONCE(cpudata->cppc_cap1_cached, value);
}
@@ -27,7 +27,7 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
return 0;
free_cpudata1:
@@ -1887,8 +1889,6 @@ static int __init amd_pstate_init(void)
@@ -1896,8 +1898,6 @@ static int __init amd_pstate_init(void)
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -35,4 +35,4 @@ Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
static_call_update(amd_pstate_enable, shmem_enable);
static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable);


@@ -0,0 +1,47 @@
From 242ffb086a15c8aec549096a6ad96d186b670fa9 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:42 -0500
Subject: cpufreq/amd-pstate: Move registration after static function call
update
On shared memory designs the static functions need to work before
registration is done or the system can hang at bootup.
Move the registration later in amd_pstate_init() to solve this.
Fixes: e238968a2087 ("cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call")
Reported-by: Klara Modin <klarasmodin@gmail.com>
Closes: https://lore.kernel.org/linux-pm/cf9c146d-bacf-444e-92e2-15ebf513af96@gmail.com/#t
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1889,12 +1889,6 @@ static int __init amd_pstate_init(void)
return -ENODEV;
}
- ret = amd_pstate_register_driver(cppc_state);
- if (ret) {
- pr_err("failed to register with return %d\n", ret);
- return ret;
- }
-
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -1905,6 +1899,12 @@ static int __init amd_pstate_init(void)
static_call_update(amd_pstate_update_perf, shmem_update_perf);
}
+ ret = amd_pstate_register_driver(cppc_state);
+ if (ret) {
+ pr_err("failed to register with return %d\n", ret);
+ return ret;
+ }
+
if (amd_pstate_prefcore) {
ret = amd_detect_prefcore(&amd_pstate_prefcore);
if (ret)

View File

@@ -0,0 +1,152 @@
From f6e6b4ebc5ef3ad05b291928d31f597210a2eb6e Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 4 Dec 2024 14:48:38 +0000
Subject: cpufreq/amd-pstate: Convert the amd_pstate_get/set_epp() to static
calls
MSR and shared memory based systems have different mechanisms to get and
set the epp value. Split those mechanisms into different functions and
assign them appropriately to the static calls at boot time. This eliminates
the need for the "if(cpu_feature_enabled(X86_FEATURE_CPPC))" checks at
runtime.
Also, propagate the error code from rdmsrl_on_cpu() and cppc_get_epp_perf()
to *_get_epp()'s caller, instead of returning -EIO unconditionally.
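Condensed, the static-call pattern applied below has three parts (the names mirror the patch; this is only a sketch, not the full diff):
/* 1) Bind a default target at compile time (the MSR path): */
DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp);
/* 2) Give callers a plain wrapper with no runtime feature check: */
static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
{
	return static_call(amd_pstate_get_epp)(cpudata, cppc_req_cached);
}
/* 3) Re-point the call once at boot (inside amd_pstate_init()) on
 *    shared memory systems: */
static_call_update(amd_pstate_get_epp, shmem_get_epp);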
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 92 +++++++++++++++++++++++-------------
1 file changed, 60 insertions(+), 32 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -210,26 +210,40 @@ static inline int get_mode_idx_from_str(
static DEFINE_MUTEX(amd_pstate_limits_lock);
static DEFINE_MUTEX(amd_pstate_driver_lock);
-static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
+static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
{
u64 epp;
int ret;
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- if (!cppc_req_cached) {
- epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
- &cppc_req_cached);
- if (epp)
- return epp;
- }
- epp = (cppc_req_cached >> 24) & 0xFF;
- } else {
- ret = cppc_get_epp_perf(cpudata->cpu, &epp);
+ if (!cppc_req_cached) {
+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req_cached);
if (ret < 0) {
pr_debug("Could not retrieve energy perf value (%d)\n", ret);
- return -EIO;
+ return ret;
}
}
+ epp = (cppc_req_cached >> 24) & 0xFF;
+
+ return (s16)epp;
+}
+
+DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp);
+
+static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
+{
+ return static_call(amd_pstate_get_epp)(cpudata, cppc_req_cached);
+}
+
+static s16 shmem_get_epp(struct amd_cpudata *cpudata, u64 dummy)
+{
+ u64 epp;
+ int ret;
+
+ ret = cppc_get_epp_perf(cpudata->cpu, &epp);
+ if (ret < 0) {
+ pr_debug("Could not retrieve energy perf value (%d)\n", ret);
+ return ret;
+ }
return (s16)(epp & 0xff);
}
@@ -283,33 +297,45 @@ static inline void amd_pstate_update_per
max_perf, fast_switch);
}
-static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
+static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp)
{
int ret;
- struct cppc_perf_ctrls perf_ctrls;
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- u64 value = READ_ONCE(cpudata->cppc_req_cached);
+ u64 value = READ_ONCE(cpudata->cppc_req_cached);
- value &= ~GENMASK_ULL(31, 24);
- value |= (u64)epp << 24;
- WRITE_ONCE(cpudata->cppc_req_cached, value);
-
- ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- if (!ret)
- cpudata->epp_cached = epp;
- } else {
- amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
- cpudata->max_limit_perf, false);
+ value &= ~GENMASK_ULL(31, 24);
+ value |= (u64)epp << 24;
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
- perf_ctrls.energy_perf = epp;
- ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
- if (ret) {
- pr_debug("failed to set energy perf value (%d)\n", ret);
- return ret;
- }
+ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
+ if (!ret)
cpudata->epp_cached = epp;
+
+ return ret;
+}
+
+DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp);
+
+static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
+{
+ return static_call(amd_pstate_set_epp)(cpudata, epp);
+}
+
+static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp)
+{
+ int ret;
+ struct cppc_perf_ctrls perf_ctrls;
+
+ amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
+ cpudata->max_limit_perf, false);
+
+ perf_ctrls.energy_perf = epp;
+ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
+ if (ret) {
+ pr_debug("failed to set energy perf value (%d)\n", ret);
+ return ret;
}
+ cpudata->epp_cached = epp;
return ret;
}
@@ -1897,6 +1923,8 @@ static int __init amd_pstate_init(void)
static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable);
static_call_update(amd_pstate_init_perf, shmem_init_perf);
static_call_update(amd_pstate_update_perf, shmem_update_perf);
+ static_call_update(amd_pstate_get_epp, shmem_get_epp);
+ static_call_update(amd_pstate_set_epp, shmem_set_epp);
}
ret = amd_pstate_register_driver(cppc_state);


@@ -1,228 +0,0 @@
From 649d296be0c7f0df6e71b4fca25fdbe75cb3994e Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Thu, 17 Oct 2024 17:03:11 +0200
Subject: amd-pstate-6.11: update setting the minimum frequency to
lowest_nonlinear_freq patchset to v3
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++---------------
drivers/cpufreq/amd-pstate.h | 4 +--
drivers/cpufreq/cpufreq.c | 6 +---
include/linux/cpufreq.h | 6 ----
4 files changed, 43 insertions(+), 40 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -557,9 +557,28 @@ cpufreq_policy_put:
cpufreq_cpu_put(policy);
}
-static int amd_pstate_verify(struct cpufreq_policy_data *policy)
+static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
{
- cpufreq_verify_within_cpu_limits(policy);
+ /*
+ * Initialize lower frequency limit (i.e.policy->min) with
+ * lowest_nonlinear_frequency which is the most energy efficient
+ * frequency. Override the initial value set by cpufreq core and
+ * amd-pstate qos_requests.
+ */
+ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
+ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
+ struct amd_cpudata *cpudata;
+
+ if (!policy)
+ return -EINVAL;
+
+ cpudata = policy->driver_data;
+ policy_data->min = cpudata->lowest_nonlinear_freq;
+ cpufreq_cpu_put(policy);
+ }
+
+ cpufreq_verify_within_cpu_limits(policy_data);
+ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
return 0;
}
@@ -709,7 +728,7 @@ static int amd_pstate_cpu_boost_update(s
policy->max = policy->cpuinfo.max_freq;
if (cppc_state == AMD_PSTATE_PASSIVE) {
- ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
+ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
if (ret < 0)
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
}
@@ -976,17 +995,17 @@ static int amd_pstate_cpu_init(struct cp
ret = amd_pstate_init_perf(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
amd_pstate_init_prefcore(cpudata);
ret = amd_pstate_init_freq(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
ret = amd_pstate_init_boost_support(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
min_freq = READ_ONCE(cpudata->min_freq);
max_freq = READ_ONCE(cpudata->max_freq);
@@ -1008,11 +1027,18 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
- ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
+ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE);
+ if (ret < 0) {
+ dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
+ goto free_cpudata1;
+ }
+
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
- goto free_cpudata;
+ goto free_cpudata2;
}
cpudata->max_limit_freq = max_freq;
@@ -1025,7 +1051,9 @@ static int amd_pstate_cpu_init(struct cp
return 0;
-free_cpudata:
+free_cpudata2:
+ freq_qos_remove_request(&cpudata->req[0]);
+free_cpudata1:
kfree(cpudata);
return ret;
}
@@ -1034,7 +1062,8 @@ static void amd_pstate_cpu_exit(struct c
{
struct amd_cpudata *cpudata = policy->driver_data;
- freq_qos_remove_request(&cpudata->max_freq_req);
+ freq_qos_remove_request(&cpudata->req[1]);
+ freq_qos_remove_request(&cpudata->req[0]);
policy->fast_switch_possible = false;
kfree(cpudata);
}
@@ -1658,13 +1687,6 @@ static int amd_pstate_epp_cpu_offline(st
return 0;
}
-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
-{
- cpufreq_verify_within_cpu_limits(policy);
- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
- return 0;
-}
-
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
@@ -1703,13 +1725,6 @@ static int amd_pstate_epp_resume(struct
return 0;
}
-static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
-{
- struct amd_cpudata *cpudata = policy->driver_data;
-
- return READ_ONCE(cpudata->lowest_nonlinear_freq);
-}
-
static struct cpufreq_driver amd_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
.verify = amd_pstate_verify,
@@ -1723,12 +1738,11 @@ static struct cpufreq_driver amd_pstate_
.update_limits = amd_pstate_update_limits,
.name = "amd-pstate",
.attr = amd_pstate_attr,
- .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static struct cpufreq_driver amd_pstate_epp_driver = {
.flags = CPUFREQ_CONST_LOOPS,
- .verify = amd_pstate_epp_verify_policy,
+ .verify = amd_pstate_verify,
.setpolicy = amd_pstate_epp_set_policy,
.init = amd_pstate_epp_cpu_init,
.exit = amd_pstate_epp_cpu_exit,
@@ -1740,7 +1754,6 @@ static struct cpufreq_driver amd_pstate_
.set_boost = amd_pstate_set_boost,
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
- .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static int __init amd_pstate_set_driver(int mode_idx)
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
/**
* struct amd_cpudata - private CPU data for AMD P-State
* @cpu: CPU number
- * @max_freq_req: maximum frequency constraint request to apply
+ * @req: constraint request to apply
* @cppc_req_cached: cached performance request hints
* @highest_perf: the maximum performance an individual processor may reach,
* assuming ideal conditions
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
struct amd_cpudata {
int cpu;
- struct freq_qos_request max_freq_req;
+ struct freq_qos_request req[2];
u64 cppc_req_cached;
u32 highest_perf;
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1380,7 +1380,6 @@ static int cpufreq_online(unsigned int c
bool new_policy;
unsigned long flags;
unsigned int j;
- u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
@@ -1465,12 +1464,9 @@ static int cpufreq_online(unsigned int c
goto out_destroy_policy;
}
- if (cpufreq_driver->get_init_min_freq)
- init_min_freq = cpufreq_driver->get_init_min_freq(policy);
-
ret = freq_qos_add_request(&policy->constraints,
policy->min_freq_req, FREQ_QOS_MIN,
- init_min_freq);
+ FREQ_QOS_MIN_DEFAULT_VALUE);
if (ret < 0) {
/*
* So we don't call freq_qos_remove_request() for an
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -414,12 +414,6 @@ struct cpufreq_driver {
* policy is properly initialized, but before the governor is started.
*/
void (*register_em)(struct cpufreq_policy *policy);
-
- /*
- * Set by drivers that want to initialize the policy->min_freq_req with
- * a value different from the default value (0) in cpufreq core.
- */
- int (*get_init_min_freq)(struct cpufreq_policy *policy);
};
/* flags */


@@ -0,0 +1,38 @@
From e73d7f9cb6d8f0e79860788371c88f88057ae34b Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 4 Dec 2024 14:48:39 +0000
Subject: cpufreq/amd-pstate: Move the invocation of amd_pstate_update_perf()
amd_pstate_update_perf() should not be part of the shmem_set_epp() function,
so move it to amd_pstate_epp_update_limit(), where it is needed.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -326,9 +326,6 @@ static int shmem_set_epp(struct amd_cpud
int ret;
struct cppc_perf_ctrls perf_ctrls;
- amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
- cpudata->max_limit_perf, false);
-
perf_ctrls.energy_perf = epp;
ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
if (ret) {
@@ -1628,6 +1625,10 @@ static int amd_pstate_epp_update_limit(s
epp = 0;
WRITE_ONCE(cpudata->cppc_req_cached, value);
+
+ amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
+ cpudata->max_limit_perf, false);
+
return amd_pstate_set_epp(cpudata, epp);
}


@@ -0,0 +1,82 @@
From 021028e977fcd835ed92a4543f7977a8aa0c1dd6 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 4 Dec 2024 14:48:40 +0000
Subject: cpufreq/amd-pstate: Refactor amd_pstate_epp_reenable() and
amd_pstate_epp_offline()
Replace similar code chunks with amd_pstate_update_perf() and
amd_pstate_set_epp() function calls.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 38 +++++++-----------------------------
1 file changed, 7 insertions(+), 31 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1660,25 +1660,17 @@ static int amd_pstate_epp_set_policy(str
static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
{
- struct cppc_perf_ctrls perf_ctrls;
- u64 value, max_perf;
+ u64 max_perf;
int ret;
ret = amd_pstate_cppc_enable(true);
if (ret)
pr_err("failed to enable amd pstate during resume, return %d\n", ret);
- value = READ_ONCE(cpudata->cppc_req_cached);
max_perf = READ_ONCE(cpudata->highest_perf);
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- } else {
- perf_ctrls.max_perf = max_perf;
- cppc_set_perf(cpudata->cpu, &perf_ctrls);
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached);
- cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
- }
+ amd_pstate_update_perf(cpudata, 0, 0, max_perf, false);
+ amd_pstate_set_epp(cpudata, cpudata->epp_cached);
}
static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy)
@@ -1698,31 +1690,15 @@ static int amd_pstate_epp_cpu_online(str
static void amd_pstate_epp_offline(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
- struct cppc_perf_ctrls perf_ctrls;
int min_perf;
- u64 value;
min_perf = READ_ONCE(cpudata->lowest_perf);
- value = READ_ONCE(cpudata->cppc_req_cached);
mutex_lock(&amd_pstate_limits_lock);
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN;
- /* Set max perf same as min perf */
- value &= ~AMD_CPPC_MAX_PERF(~0L);
- value |= AMD_CPPC_MAX_PERF(min_perf);
- value &= ~AMD_CPPC_MIN_PERF(~0L);
- value |= AMD_CPPC_MIN_PERF(min_perf);
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- } else {
- perf_ctrls.desired_perf = 0;
- perf_ctrls.min_perf = min_perf;
- perf_ctrls.max_perf = min_perf;
- cppc_set_perf(cpudata->cpu, &perf_ctrls);
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);
- cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
- }
+ amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false);
+ amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE);
+
mutex_unlock(&amd_pstate_limits_lock);
}


@@ -0,0 +1,43 @@
From f94f6415c70f76d885dd542e8631f826f427941e Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 4 Dec 2024 14:48:41 +0000
Subject: cpufreq/amd-pstate: Remove the cppc_state check in offline/online
functions
Only the amd_pstate_epp driver (i.e. cppc_state = ACTIVE) enters the
amd_pstate_epp_offline() and amd_pstate_epp_cpu_online() functions,
so remove the unnecessary if condition that checks whether cppc_state is
equal to AMD_PSTATE_ACTIVE.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1679,10 +1679,8 @@ static int amd_pstate_epp_cpu_online(str
pr_debug("AMD CPU Core %d going online\n", cpudata->cpu);
- if (cppc_state == AMD_PSTATE_ACTIVE) {
- amd_pstate_epp_reenable(cpudata);
- cpudata->suspended = false;
- }
+ amd_pstate_epp_reenable(cpudata);
+ cpudata->suspended = false;
return 0;
}
@@ -1711,8 +1709,7 @@ static int amd_pstate_epp_cpu_offline(st
if (cpudata->suspended)
return 0;
- if (cppc_state == AMD_PSTATE_ACTIVE)
- amd_pstate_epp_offline(policy);
+ amd_pstate_epp_offline(policy);
return 0;
}


@@ -0,0 +1,56 @@
From 5aea3e8c4255cb04876e3af714d58ed329376b7f Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Wed, 4 Dec 2024 14:48:42 +0000
Subject: cpufreq/amd-pstate: Merge amd_pstate_epp_cpu_offline() and
amd_pstate_epp_offline()
amd_pstate_epp_offline() is only called from within
amd_pstate_epp_cpu_offline(), so it doesn't make much sense to keep it as
a separate function. Hence, merge it into its caller.
Also remove the unnecessary debug print in the offline path while at it.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 17 ++++-------------
1 file changed, 4 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1685,11 +1685,14 @@ static int amd_pstate_epp_cpu_online(str
return 0;
}
-static void amd_pstate_epp_offline(struct cpufreq_policy *policy)
+static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
int min_perf;
+ if (cpudata->suspended)
+ return 0;
+
min_perf = READ_ONCE(cpudata->lowest_perf);
mutex_lock(&amd_pstate_limits_lock);
@@ -1698,18 +1701,6 @@ static void amd_pstate_epp_offline(struc
amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE);
mutex_unlock(&amd_pstate_limits_lock);
-}
-
-static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
-{
- struct amd_cpudata *cpudata = policy->driver_data;
-
- pr_debug("AMD CPU Core %d going offline\n", cpudata->cpu);
-
- if (cpudata->suspended)
- return 0;
-
- amd_pstate_epp_offline(policy);
return 0;
}


@@ -0,0 +1,132 @@
From ea6b500eb38124a59e83254435340e0390117c54 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:33 -0600
Subject: cpufreq/amd-pstate: Add trace event for EPP perf updates
In "active" mode the most important thing for debugging whether
an issue is hardware or software based is to look at what was the
last thing written to the CPPC request MSR or shared memory region.
The 'amd_pstate_epp_perf' trace event shows the values being written
for all CPUs.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate-trace.h | 45 ++++++++++++++++++++++++++++++
drivers/cpufreq/amd-pstate.c | 28 +++++++++++++++++++
2 files changed, 73 insertions(+)
--- a/drivers/cpufreq/amd-pstate-trace.h
+++ b/drivers/cpufreq/amd-pstate-trace.h
@@ -88,6 +88,51 @@ TRACE_EVENT(amd_pstate_perf,
)
);
+TRACE_EVENT(amd_pstate_epp_perf,
+
+ TP_PROTO(unsigned int cpu_id,
+ unsigned int highest_perf,
+ unsigned int epp,
+ unsigned int min_perf,
+ unsigned int max_perf,
+ bool boost
+ ),
+
+ TP_ARGS(cpu_id,
+ highest_perf,
+ epp,
+ min_perf,
+ max_perf,
+ boost),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cpu_id)
+ __field(unsigned int, highest_perf)
+ __field(unsigned int, epp)
+ __field(unsigned int, min_perf)
+ __field(unsigned int, max_perf)
+ __field(bool, boost)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = cpu_id;
+ __entry->highest_perf = highest_perf;
+ __entry->epp = epp;
+ __entry->min_perf = min_perf;
+ __entry->max_perf = max_perf;
+ __entry->boost = boost;
+ ),
+
+ TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u",
+ (unsigned int)__entry->cpu_id,
+ (unsigned int)__entry->min_perf,
+ (unsigned int)__entry->max_perf,
+ (unsigned int)__entry->highest_perf,
+ (unsigned int)__entry->epp,
+ (bool)__entry->boost
+ )
+);
+
#endif /* _AMD_PSTATE_TRACE_H */
/* This part must be outside protection */
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -354,6 +354,14 @@ static int amd_pstate_set_energy_pref_in
return -EBUSY;
}
+ if (trace_amd_pstate_epp_perf_enabled()) {
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
+ epp,
+ AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached),
+ AMD_CPPC_MAX_PERF(cpudata->cppc_req_cached),
+ cpudata->boost_state);
+ }
+
ret = amd_pstate_set_epp(cpudata, epp);
return ret;
@@ -1626,6 +1634,13 @@ static int amd_pstate_epp_update_limit(s
WRITE_ONCE(cpudata->cppc_req_cached, value);
+ if (trace_amd_pstate_epp_perf_enabled()) {
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp,
+ cpudata->min_limit_perf,
+ cpudata->max_limit_perf,
+ cpudata->boost_state);
+ }
+
amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
cpudata->max_limit_perf, false);
@@ -1669,6 +1684,13 @@ static void amd_pstate_epp_reenable(stru
max_perf = READ_ONCE(cpudata->highest_perf);
+ if (trace_amd_pstate_epp_perf_enabled()) {
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
+ cpudata->epp_cached,
+ AMD_CPPC_MIN_PERF(cpudata->cppc_req_cached),
+ max_perf, cpudata->boost_state);
+ }
+
amd_pstate_update_perf(cpudata, 0, 0, max_perf, false);
amd_pstate_set_epp(cpudata, cpudata->epp_cached);
}
@@ -1697,6 +1719,12 @@ static int amd_pstate_epp_cpu_offline(st
mutex_lock(&amd_pstate_limits_lock);
+ if (trace_amd_pstate_epp_perf_enabled()) {
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE,
+ min_perf, min_perf, cpudata->boost_state);
+ }
+
amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false);
amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE);


@@ -0,0 +1,123 @@
From 57fdcf14dcd80808fc46edea6d31bb2a699c0fd1 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:34 -0600
Subject: cpufreq/amd-pstate: convert mutex use to guard()
Using a scoped guard declaration will unlock mutexes automatically when
the guard goes out of scope.
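A minimal sketch of the property this relies on (assumes linux/cleanup.h; the example_ name is illustrative, while the lock and amd_pstate_show_status() appear in the patch):
/* The guard releases amd_pstate_driver_lock on every return path,
 * including early returns, so the explicit mutex_unlock() calls removed
 * by this patch are no longer needed. */
static ssize_t example_status_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	guard(mutex)(&amd_pstate_driver_lock);
	return amd_pstate_show_status(buf);
}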
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 32 ++++++++++++--------------------
1 file changed, 12 insertions(+), 20 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -782,12 +782,12 @@ static int amd_pstate_set_boost(struct c
pr_err("Boost mode is not supported by this processor or SBIOS\n");
return -EOPNOTSUPP;
}
- mutex_lock(&amd_pstate_driver_lock);
+ guard(mutex)(&amd_pstate_driver_lock);
+
ret = amd_pstate_cpu_boost_update(policy, state);
WRITE_ONCE(cpudata->boost_state, !ret ? state : false);
policy->boost_enabled = !ret ? state : false;
refresh_frequency_limits(policy);
- mutex_unlock(&amd_pstate_driver_lock);
return ret;
}
@@ -878,7 +878,8 @@ static void amd_pstate_update_limits(uns
if (!amd_pstate_prefcore)
return;
- mutex_lock(&amd_pstate_driver_lock);
+ guard(mutex)(&amd_pstate_driver_lock);
+
ret = amd_get_highest_perf(cpu, &cur_high);
if (ret)
goto free_cpufreq_put;
@@ -898,7 +899,6 @@ free_cpufreq_put:
if (!highest_perf_changed)
cpufreq_update_policy(cpu);
- mutex_unlock(&amd_pstate_driver_lock);
}
/*
@@ -1231,11 +1231,11 @@ static ssize_t store_energy_performance_
if (ret < 0)
return -EINVAL;
- mutex_lock(&amd_pstate_limits_lock);
+ guard(mutex)(&amd_pstate_limits_lock);
+
ret = amd_pstate_set_energy_pref_index(cpudata, ret);
- mutex_unlock(&amd_pstate_limits_lock);
- return ret ?: count;
+ return ret ? ret : count;
}
static ssize_t show_energy_performance_preference(
@@ -1399,13 +1399,10 @@ EXPORT_SYMBOL_GPL(amd_pstate_update_stat
static ssize_t status_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- ssize_t ret;
- mutex_lock(&amd_pstate_driver_lock);
- ret = amd_pstate_show_status(buf);
- mutex_unlock(&amd_pstate_driver_lock);
+ guard(mutex)(&amd_pstate_driver_lock);
- return ret;
+ return amd_pstate_show_status(buf);
}
static ssize_t status_store(struct device *a, struct device_attribute *b,
@@ -1414,9 +1411,8 @@ static ssize_t status_store(struct devic
char *p = memchr(buf, '\n', count);
int ret;
- mutex_lock(&amd_pstate_driver_lock);
+ guard(mutex)(&amd_pstate_driver_lock);
ret = amd_pstate_update_status(buf, p ? p - buf : count);
- mutex_unlock(&amd_pstate_driver_lock);
return ret < 0 ? ret : count;
}
@@ -1717,7 +1713,7 @@ static int amd_pstate_epp_cpu_offline(st
min_perf = READ_ONCE(cpudata->lowest_perf);
- mutex_lock(&amd_pstate_limits_lock);
+ guard(mutex)(&amd_pstate_limits_lock);
if (trace_amd_pstate_epp_perf_enabled()) {
trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
@@ -1728,8 +1724,6 @@ static int amd_pstate_epp_cpu_offline(st
amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false);
amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE);
- mutex_unlock(&amd_pstate_limits_lock);
-
return 0;
}
@@ -1758,13 +1752,11 @@ static int amd_pstate_epp_resume(struct
struct amd_cpudata *cpudata = policy->driver_data;
if (cpudata->suspended) {
- mutex_lock(&amd_pstate_limits_lock);
+ guard(mutex)(&amd_pstate_limits_lock);
/* enable amd pstate from suspend state*/
amd_pstate_epp_reenable(cpudata);
- mutex_unlock(&amd_pstate_limits_lock);
-
cpudata->suspended = false;
}

View File

@@ -0,0 +1,52 @@
From 06c52df851256238253ec286682cafbd5bb832d8 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:35 -0600
Subject: cpufreq/amd-pstate: Drop cached epp_policy variable
epp_policy is not used anywhere in the current code, so there is no need to
cache it.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 3 ---
drivers/cpufreq/amd-pstate.h | 2 --
2 files changed, 5 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1506,7 +1506,6 @@ static int amd_pstate_epp_cpu_init(struc
return -ENOMEM;
cpudata->cpu = policy->cpu;
- cpudata->epp_policy = 0;
ret = amd_pstate_init_perf(cpudata);
if (ret)
@@ -1613,8 +1612,6 @@ static int amd_pstate_epp_update_limit(s
value &= ~AMD_CPPC_DES_PERF(~0L);
value |= AMD_CPPC_DES_PERF(0);
- cpudata->epp_policy = cpudata->policy;
-
/* Get BIOS pre-defined epp value */
epp = amd_pstate_get_epp(cpudata, value);
if (epp < 0) {
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -57,7 +57,6 @@ struct amd_aperf_mperf {
* @hw_prefcore: check whether HW supports preferred core featue.
* Only when hw_prefcore and early prefcore param are true,
* AMD P-State driver supports preferred core featue.
- * @epp_policy: Last saved policy used to set energy-performance preference
* @epp_cached: Cached CPPC energy-performance preference value
* @policy: Cpufreq policy value
* @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value
@@ -94,7 +93,6 @@ struct amd_cpudata {
bool hw_prefcore;
/* EPP feature related attributes*/
- s16 epp_policy;
s16 epp_cached;
u32 policy;
u64 cppc_cap1_cached;

View File

@@ -0,0 +1,117 @@
From da3e84f3dd424c1b7fa6d86484ddcccac391e49c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:36 -0600
Subject: cpufreq/amd-pstate: Use FIELD_PREP and FIELD_GET macros
The FIELD_PREP and FIELD_GET macros improve readability and help
to avoid shifting bugs.
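As a brief illustration of the macros being adopted, the sketch below packs and unpacks one 8-bit field with FIELD_PREP()/FIELD_GET(); EXAMPLE_EPP_MASK is a hypothetical stand-in for the AMD_PSTATE_*_PERF_MASK definitions introduced by the patch.

```
/*
 * Sketch of FIELD_PREP()/FIELD_GET() usage, assuming <linux/bitfield.h>;
 * EXAMPLE_EPP_MASK is a hypothetical stand-in for the EPP bit field.
 */
#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

#define EXAMPLE_EPP_MASK	GENMASK(31, 24)

static u64 example_set_epp(u64 req, u8 epp)
{
	req &= ~EXAMPLE_EPP_MASK;			/* clear the field */
	req |= FIELD_PREP(EXAMPLE_EPP_MASK, epp);	/* shift epp into bits 31:24 */
	return req;
}

static u8 example_get_epp(u64 req)
{
	return FIELD_GET(EXAMPLE_EPP_MASK, req);	/* extract and shift back down */
}
```

The mask defines the field once, and the macros handle the shifting, which is exactly the class of bug the open-coded `>> 24 & 0xFF` variants were prone to.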
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 45 ++++++++++++++++--------------------
1 file changed, 20 insertions(+), 25 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -22,6 +22,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/bitfield.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -88,6 +89,11 @@ static bool cppc_enabled;
static bool amd_pstate_prefcore = true;
static struct quirk_entry *quirks;
+#define AMD_PSTATE_MAX_PERF_MASK GENMASK(7, 0)
+#define AMD_PSTATE_MIN_PERF_MASK GENMASK(15, 8)
+#define AMD_PSTATE_DES_PERF_MASK GENMASK(23, 16)
+#define AMD_PSTATE_EPP_PERF_MASK GENMASK(31, 24)
+
/*
* AMD Energy Preference Performance (EPP)
* The EPP is used in the CCLK DPM controller to drive
@@ -212,7 +218,6 @@ static DEFINE_MUTEX(amd_pstate_driver_lo
static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
{
- u64 epp;
int ret;
if (!cppc_req_cached) {
@@ -222,9 +227,8 @@ static s16 msr_get_epp(struct amd_cpudat
return ret;
}
}
- epp = (cppc_req_cached >> 24) & 0xFF;
- return (s16)epp;
+ return FIELD_GET(AMD_PSTATE_EPP_PERF_MASK, cppc_req_cached);
}
DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp);
@@ -299,12 +303,11 @@ static inline void amd_pstate_update_per
static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp)
{
- int ret;
-
u64 value = READ_ONCE(cpudata->cppc_req_cached);
+ int ret;
- value &= ~GENMASK_ULL(31, 24);
- value |= (u64)epp << 24;
+ value &= ~AMD_PSTATE_EPP_PERF_MASK;
+ value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, epp);
WRITE_ONCE(cpudata->cppc_req_cached, value);
ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
@@ -563,18 +566,15 @@ static void amd_pstate_update(struct amd
des_perf = 0;
}
- value &= ~AMD_CPPC_MIN_PERF(~0L);
- value |= AMD_CPPC_MIN_PERF(min_perf);
-
- value &= ~AMD_CPPC_DES_PERF(~0L);
- value |= AMD_CPPC_DES_PERF(des_perf);
-
/* limit the max perf when core performance boost feature is disabled */
if (!cpudata->boost_supported)
max_perf = min_t(unsigned long, nominal_perf, max_perf);
- value &= ~AMD_CPPC_MAX_PERF(~0L);
- value |= AMD_CPPC_MAX_PERF(max_perf);
+ value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
+ AMD_PSTATE_DES_PERF_MASK);
+ value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, max_perf);
+ value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, des_perf);
+ value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, min_perf);
if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
@@ -1601,16 +1601,11 @@ static int amd_pstate_epp_update_limit(s
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
min_perf = min(cpudata->nominal_perf, max_perf);
- /* Initial min/max values for CPPC Performance Controls Register */
- value &= ~AMD_CPPC_MIN_PERF(~0L);
- value |= AMD_CPPC_MIN_PERF(min_perf);
-
- value &= ~AMD_CPPC_MAX_PERF(~0L);
- value |= AMD_CPPC_MAX_PERF(max_perf);
-
- /* CPPC EPP feature require to set zero to the desire perf bit */
- value &= ~AMD_CPPC_DES_PERF(~0L);
- value |= AMD_CPPC_DES_PERF(0);
+ value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
+ AMD_PSTATE_DES_PERF_MASK);
+ value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, max_perf);
+ value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, 0);
+ value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, min_perf);
/* Get BIOS pre-defined epp value */
epp = amd_pstate_get_epp(cpudata, value);

View File

@@ -0,0 +1,103 @@
From b1e3f18eb18f9febe9f4e32a1ef55cf60c97892a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:37 -0600
Subject: cpufreq/amd-pstate: Store the boost numerator as highest perf again
commit ad4caad58d91d ("cpufreq: amd-pstate: Merge
amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()")
changed the semantics for highest perf and commit 18d9b52271213
("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled")
worked around those semantic changes.
This, however, is a confusing result, and it also makes changing frequency
limits and boost awkward because of the scaling differences. Restore the
boost numerator to highest perf again.
Suggested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 4 +---
drivers/cpufreq/amd-pstate.c | 25 ++++++++++++---------
2 files changed, 16 insertions(+), 13 deletions(-)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -251,9 +251,7 @@ performance supported in `AMD CPPC Perfo
In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
table, so we need to expose it to sysfs. If boost is not active, but
still supported, this maximum frequency will be larger than the one in
-``cpuinfo``. On systems that support preferred core, the driver will have
-different values for some cores than others and this will reflect the values
-advertised by the platform at bootup.
+``cpuinfo``.
This attribute is read-only.
``amd_pstate_lowest_nonlinear_freq``
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -438,15 +438,19 @@ static inline int amd_pstate_cppc_enable
static int msr_init_perf(struct amd_cpudata *cpudata)
{
- u64 cap1;
+ u64 cap1, numerator;
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
&cap1);
if (ret)
return ret;
- WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
- WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
+ if (ret)
+ return ret;
+
+ WRITE_ONCE(cpudata->highest_perf, numerator);
+ WRITE_ONCE(cpudata->max_limit_perf, numerator);
WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
@@ -458,13 +462,18 @@ static int msr_init_perf(struct amd_cpud
static int shmem_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
+ u64 numerator;
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
if (ret)
return ret;
- WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
- WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf);
+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
+ if (ret)
+ return ret;
+
+ WRITE_ONCE(cpudata->highest_perf, numerator);
+ WRITE_ONCE(cpudata->max_limit_perf, numerator);
WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
WRITE_ONCE(cpudata->lowest_nonlinear_perf,
cppc_perf.lowest_nonlinear_perf);
@@ -950,7 +959,6 @@ static int amd_pstate_init_freq(struct a
{
int ret;
u32 min_freq, max_freq;
- u64 numerator;
u32 nominal_perf, nominal_freq;
u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
u32 boost_ratio, lowest_nonlinear_ratio;
@@ -972,10 +980,7 @@ static int amd_pstate_init_freq(struct a
nominal_perf = READ_ONCE(cpudata->nominal_perf);
- ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
- if (ret)
- return ret;
- boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
+ boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);

View File

@@ -1,183 +0,0 @@
From 27b069b518f77fffc337286a2fad8a6305daa60f Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 4 Nov 2024 16:28:55 -0600
Subject: ACPI: processor: Move arch_init_invariance_cppc() call later
arch_init_invariance_cppc() is called at the end of
acpi_cppc_processor_probe() in order to configure frequency invariance
based upon the values from _CPC.
This, however, doesn't work on AMD CPPC shared-memory designs with AMD
preferred cores enabled, because _CPC from all cores needs to be analyzed
to judge whether preferred cores are enabled.
This issue manifests to users as a warning since commit 21fb59ab4b97
("ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn"):
```
Could not retrieve highest performance (-19)
```
However, the warning is not the cause of the problem; it was actually
commit 279f838a61f9 ("x86/amd: Detect preferred cores in
amd_get_boost_ratio_numerator()") that exposed the issue.
To fix this problem, change arch_init_invariance_cppc() into a new weak
symbol that is called at the end of acpi_processor_driver_init().
Each architecture that supports it can declare the symbol to override
the weak one.
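For clarity, here is a minimal sketch of the weak-symbol override mechanism the patch relies on, using hypothetical names; the two definitions live in separate compilation units, just as the real ones do.

```
/*
 * Sketch of the __weak override pattern used here, with hypothetical names;
 * the two definitions live in separate compilation units, so the strong one
 * replaces the weak default at link time.
 */
#include <linux/compiler.h>

/* generic code (e.g. the ACPI processor driver): weak no-op default */
void __weak example_arch_init_hook(void)
{
}

/*
 * architecture code (e.g. arch/x86): a strong definition overrides the weak
 * one, so the generic caller needs no #ifdef.
 */
void example_arch_init_hook(void)
{
	/* perform the architecture-specific initialization here */
}
```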
Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()")
Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219431
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/arm64/include/asm/topology.h | 4 ----
arch/x86/include/asm/topology.h | 5 -----
arch/x86/kernel/acpi/cppc.c | 7 ++++++-
drivers/acpi/cppc_acpi.c | 6 ------
drivers/acpi/processor_driver.c | 9 +++++++++
drivers/base/arch_topology.c | 6 +++++-
include/acpi/processor.h | 2 ++
include/linux/arch_topology.h | 4 ----
8 files changed, 22 insertions(+), 21 deletions(-)
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -25,10 +25,6 @@ void update_freq_counters_refs(void);
#define arch_scale_freq_invariant topology_scale_freq_invariant
#define arch_scale_freq_ref topology_get_freq_ref
-#ifdef CONFIG_ACPI_CPPC_LIB
-#define arch_init_invariance_cppc topology_init_cpu_capacity_cppc
-#endif
-
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -292,9 +292,4 @@ static inline void freq_invariance_set_p
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
-#ifdef CONFIG_ACPI_CPPC_LIB
-void init_freq_invariance_cppc(void);
-#define arch_init_invariance_cppc init_freq_invariance_cppc
-#endif
-
#endif /* _ASM_X86_TOPOLOGY_H */
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -110,7 +110,7 @@ static void amd_set_max_freq_ratio(void)
static DEFINE_MUTEX(freq_invariance_lock);
-void init_freq_invariance_cppc(void)
+static inline void init_freq_invariance_cppc(void)
{
static bool init_done;
@@ -127,6 +127,11 @@ void init_freq_invariance_cppc(void)
mutex_unlock(&freq_invariance_lock);
}
+void acpi_processor_init_invariance_cppc(void)
+{
+ init_freq_invariance_cppc();
+}
+
/*
* Get the highest performance register value.
* @cpu: CPU from which to get highest performance.
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -671,10 +671,6 @@ static int pcc_data_alloc(int pcc_ss_id)
* )
*/
-#ifndef arch_init_invariance_cppc
-static inline void arch_init_invariance_cppc(void) { }
-#endif
-
/**
* acpi_cppc_processor_probe - Search for per CPU _CPC objects.
* @pr: Ptr to acpi_processor containing this CPU's logical ID.
@@ -905,8 +901,6 @@ int acpi_cppc_processor_probe(struct acp
goto out_free;
}
- arch_init_invariance_cppc();
-
kfree(output.pointer);
return 0;
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -237,6 +237,9 @@ static struct notifier_block acpi_proces
.notifier_call = acpi_processor_notifier,
};
+void __weak acpi_processor_init_invariance_cppc(void)
+{ }
+
/*
* We keep the driver loaded even when ACPI is not running.
* This is needed for the powernow-k8 driver, that works even without
@@ -270,6 +273,12 @@ static int __init acpi_processor_driver_
NULL, acpi_soft_cpu_dead);
acpi_processor_throttling_init();
+
+ /*
+ * Frequency invariance calculations on AMD platforms can't be run until
+ * after acpi_cppc_processor_probe() has been called for all online CPUs
+ */
+ acpi_processor_init_invariance_cppc();
return 0;
err:
driver_unregister(&acpi_processor_driver);
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -366,7 +366,7 @@ void __weak freq_inv_set_max_ratio(int c
#ifdef CONFIG_ACPI_CPPC_LIB
#include <acpi/cppc_acpi.h>
-void topology_init_cpu_capacity_cppc(void)
+static inline void topology_init_cpu_capacity_cppc(void)
{
u64 capacity, capacity_scale = 0;
struct cppc_perf_caps perf_caps;
@@ -417,6 +417,10 @@ void topology_init_cpu_capacity_cppc(voi
exit:
free_raw_capacity();
}
+void acpi_processor_init_invariance_cppc(void)
+{
+ topology_init_cpu_capacity_cppc();
+}
#endif
#ifdef CONFIG_CPU_FREQ
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -465,4 +465,6 @@ extern int acpi_processor_ffh_lpi_probe(
extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
#endif
+void acpi_processor_init_invariance_cppc(void);
+
#endif
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -11,10 +11,6 @@
void topology_normalize_cpu_scale(void);
int topology_update_cpu_topology(void);
-#ifdef CONFIG_ACPI_CPPC_LIB
-void topology_init_cpu_capacity_cppc(void);
-#endif
-
struct device_node;
bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);

View File

@@ -0,0 +1,45 @@
From fab64249be63c384484cf82f4e08724577ad3b84 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:38 -0600
Subject: cpufreq/amd-pstate: Use boost numerator for upper bound of
frequencies
commit 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits
when boost is disabled") introduced different semantics for min/max limits
based upon whether the user turned off boost from sysfs.
This, however, is not necessary when the highest perf value is the boost
numerator.
Suggested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Fixes: 18d9b5227121 ("cpufreq/amd-pstate: Use nominal perf for limits when boost is disabled")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -631,16 +631,13 @@ static int amd_pstate_verify(struct cpuf
static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
{
- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf;
+ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq;
struct amd_cpudata *cpudata = policy->driver_data;
- if (cpudata->boost_supported && !policy->boost_enabled)
- max_perf = READ_ONCE(cpudata->nominal_perf);
- else
- max_perf = READ_ONCE(cpudata->highest_perf);
-
- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
+ max_perf = READ_ONCE(cpudata->highest_perf);
+ max_freq = READ_ONCE(cpudata->max_freq);
+ max_limit_perf = div_u64(policy->max * max_perf, max_freq);
+ min_limit_perf = div_u64(policy->min * max_perf, max_freq);
lowest_perf = READ_ONCE(cpudata->lowest_perf);
if (min_limit_perf < lowest_perf)

View File

@@ -0,0 +1,36 @@
From a9b210306e861b7c2d3b8532c85e8cd54c3b322a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:39 -0600
Subject: cpufreq/amd-pstate: Only update the cached value in msr_set_epp() on
success
If writing the MSR_AMD_CPPC_REQ MSR fails, the cached value in the
amd_cpudata structure should not be updated.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -308,11 +308,15 @@ static int msr_set_epp(struct amd_cpudat
value &= ~AMD_PSTATE_EPP_PERF_MASK;
value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, epp);
- WRITE_ONCE(cpudata->cppc_req_cached, value);
ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- if (!ret)
- cpudata->epp_cached = epp;
+ if (ret) {
+ pr_err("failed to set energy perf value (%d)\n", ret);
+ return ret;
+ }
+
+ cpudata->epp_cached = epp;
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
return ret;
}

View File

@@ -0,0 +1,134 @@
From 27a3d0642a3d26ba54e8a21575e6d2cb0ada55b3 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:40 -0600
Subject: cpufreq/amd-pstate: store all values in cpudata struct in khz
Storing values in the cpudata structure in different units leads to
confusion and hardcoded conversions elsewhere. After the ratios are
calculated, store everything in kHz for any future use, and adjust all
relevant consumers accordingly.
Suggested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate-ut.c | 12 +++++-------
drivers/cpufreq/amd-pstate.c | 30 +++++++++++++++---------------
2 files changed, 20 insertions(+), 22 deletions(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -207,7 +207,6 @@ static void amd_pstate_ut_check_freq(u32
int cpu = 0;
struct cpufreq_policy *policy = NULL;
struct amd_cpudata *cpudata = NULL;
- u32 nominal_freq_khz;
for_each_possible_cpu(cpu) {
policy = cpufreq_cpu_get(cpu);
@@ -215,14 +214,13 @@ static void amd_pstate_ut_check_freq(u32
break;
cpudata = policy->driver_data;
- nominal_freq_khz = cpudata->nominal_freq*1000;
- if (!((cpudata->max_freq >= nominal_freq_khz) &&
- (nominal_freq_khz > cpudata->lowest_nonlinear_freq) &&
+ if (!((cpudata->max_freq >= cpudata->nominal_freq) &&
+ (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) &&
(cpudata->lowest_nonlinear_freq > cpudata->min_freq) &&
(cpudata->min_freq > 0))) {
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n",
- __func__, cpu, cpudata->max_freq, nominal_freq_khz,
+ __func__, cpu, cpudata->max_freq, cpudata->nominal_freq,
cpudata->lowest_nonlinear_freq, cpudata->min_freq);
goto skip_test;
}
@@ -236,13 +234,13 @@ static void amd_pstate_ut_check_freq(u32
if (cpudata->boost_supported) {
if ((policy->max == cpudata->max_freq) ||
- (policy->max == nominal_freq_khz))
+ (policy->max == cpudata->nominal_freq))
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
else {
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n",
__func__, cpu, policy->max, cpudata->max_freq,
- nominal_freq_khz);
+ cpudata->nominal_freq);
goto skip_test;
}
} else {
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -769,8 +769,8 @@ static int amd_pstate_cpu_boost_update(s
if (on)
policy->cpuinfo.max_freq = max_freq;
- else if (policy->cpuinfo.max_freq > nominal_freq * 1000)
- policy->cpuinfo.max_freq = nominal_freq * 1000;
+ else if (policy->cpuinfo.max_freq > nominal_freq)
+ policy->cpuinfo.max_freq = nominal_freq;
policy->max = policy->cpuinfo.max_freq;
@@ -970,29 +970,29 @@ static int amd_pstate_init_freq(struct a
return ret;
if (quirks && quirks->lowest_freq)
- min_freq = quirks->lowest_freq * 1000;
+ min_freq = quirks->lowest_freq;
else
- min_freq = cppc_perf.lowest_freq * 1000;
+ min_freq = cppc_perf.lowest_freq;
if (quirks && quirks->nominal_freq)
- nominal_freq = quirks->nominal_freq ;
+ nominal_freq = quirks->nominal_freq;
else
nominal_freq = cppc_perf.nominal_freq;
nominal_perf = READ_ONCE(cpudata->nominal_perf);
boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
- max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
+ max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT);
lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT,
nominal_perf);
- lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
+ lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT);
- WRITE_ONCE(cpudata->min_freq, min_freq);
- WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq);
- WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
- WRITE_ONCE(cpudata->max_freq, max_freq);
+ WRITE_ONCE(cpudata->min_freq, min_freq * 1000);
+ WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000);
+ WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000);
+ WRITE_ONCE(cpudata->max_freq, max_freq * 1000);
/**
* Below values need to be initialized correctly, otherwise driver will fail to load
@@ -1000,15 +1000,15 @@ static int amd_pstate_init_freq(struct a
* lowest_nonlinear_freq is a value between [min_freq, nominal_freq]
* Check _CPC in ACPI table objects if any values are incorrect
*/
- if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) {
+ if (min_freq <= 0 || max_freq <= 0 || cpudata->nominal_freq <= 0 || min_freq > max_freq) {
pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n",
- min_freq, max_freq, nominal_freq * 1000);
+ min_freq, max_freq, cpudata->nominal_freq);
return -EINVAL;
}
- if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq * 1000) {
+ if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > cpudata->nominal_freq) {
pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n",
- lowest_nonlinear_freq, min_freq, nominal_freq * 1000);
+ lowest_nonlinear_freq, min_freq, cpudata->nominal_freq);
return -EINVAL;
}

View File

@@ -0,0 +1,69 @@
From b2a0f625fa30dc907daf1b07bf94b15872da096b Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:41 -0600
Subject: cpufreq/amd-pstate: Change amd_pstate_update_perf() to return an int
As msr_update_perf() writes an MSR, the write can fail. Pass the return
code up to the caller.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -281,24 +281,26 @@ static int amd_pstate_get_energy_pref_in
return index;
}
-static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
+static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, bool fast_switch)
{
- if (fast_switch)
+ if (fast_switch) {
wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached));
- else
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
- READ_ONCE(cpudata->cppc_req_cached));
+ return 0;
+ }
+
+ return wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
+ READ_ONCE(cpudata->cppc_req_cached));
}
DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
-static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
+static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
u32 max_perf, bool fast_switch)
{
- static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
- max_perf, fast_switch);
+ return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
+ max_perf, fast_switch);
}
static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp)
@@ -510,7 +512,7 @@ static inline int amd_pstate_init_perf(s
return static_call(amd_pstate_init_perf)(cpudata);
}
-static void shmem_update_perf(struct amd_cpudata *cpudata,
+static int shmem_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
u32 max_perf, bool fast_switch)
{
@@ -520,7 +522,7 @@ static void shmem_update_perf(struct amd
perf_ctrls.min_perf = min_perf;
perf_ctrls.desired_perf = des_perf;
- cppc_set_perf(cpudata->cpu, &perf_ctrls);
+ return cppc_set_perf(cpudata->cpu, &perf_ctrls);
}
static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)

View File

@@ -0,0 +1,85 @@
From 8399d57cda79a662800a4cd01bd7db951355f451 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:42 -0600
Subject: cpufreq/amd-pstate: Move limit updating code
The limit updating code in amd_pstate_epp_update_limit() should not
only apply to EPP updates. Move it to amd_pstate_update_min_max_limit()
so other callers can benefit as well.
With this move the clamp_t() calls are no longer necessary, because the
verify callback is invoked when limits are set.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 28 +++++-----------------------
1 file changed, 5 insertions(+), 23 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -567,10 +567,6 @@ static void amd_pstate_update(struct amd
u32 nominal_perf = READ_ONCE(cpudata->nominal_perf);
u64 value = prev;
- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf,
- cpudata->max_limit_perf);
- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
- cpudata->max_limit_perf);
des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
max_freq = READ_ONCE(cpudata->max_limit_freq);
@@ -637,7 +633,7 @@ static int amd_pstate_verify(struct cpuf
static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
{
- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq;
+ u32 max_limit_perf, min_limit_perf, max_perf, max_freq;
struct amd_cpudata *cpudata = policy->driver_data;
max_perf = READ_ONCE(cpudata->highest_perf);
@@ -645,12 +641,8 @@ static int amd_pstate_update_min_max_lim
max_limit_perf = div_u64(policy->max * max_perf, max_freq);
min_limit_perf = div_u64(policy->min * max_perf, max_freq);
- lowest_perf = READ_ONCE(cpudata->lowest_perf);
- if (min_limit_perf < lowest_perf)
- min_limit_perf = lowest_perf;
-
- if (max_limit_perf < min_limit_perf)
- max_limit_perf = min_limit_perf;
+ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
+ min_limit_perf = min(cpudata->nominal_perf, max_limit_perf);
WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
@@ -1592,28 +1584,18 @@ static void amd_pstate_epp_cpu_exit(stru
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
- u32 max_perf, min_perf;
u64 value;
s16 epp;
- max_perf = READ_ONCE(cpudata->highest_perf);
- min_perf = READ_ONCE(cpudata->lowest_perf);
amd_pstate_update_min_max_limit(policy);
- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
- cpudata->max_limit_perf);
- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf,
- cpudata->max_limit_perf);
value = READ_ONCE(cpudata->cppc_req_cached);
- if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
- min_perf = min(cpudata->nominal_perf, max_perf);
-
value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
AMD_PSTATE_DES_PERF_MASK);
- value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, max_perf);
+ value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, cpudata->max_limit_perf);
value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, 0);
- value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, min_perf);
+ value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, cpudata->min_limit_perf);
/* Get BIOS pre-defined epp value */
epp = amd_pstate_get_epp(cpudata, value);

View File

@@ -0,0 +1,233 @@
From fe7f993593ee1891acc875dfb1e091cf17a43f7a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:43 -0600
Subject: cpufreq/amd-pstate: Cache EPP value and use that everywhere
Cache the value in cpudata->epp_cached, and use that for all callers.
As all callers now use the cached value, merge
amd_pstate_get_energy_pref_index() into
show_energy_performance_preference().
Check whether the EPP value has changed before writing it to the MSR or
the shared memory region.
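A minimal sketch of the compare-before-write idea described above, with hypothetical names; the early return on an unchanged value and the cache update after a successful write are the point, not the stub hardware call.

```
/*
 * Sketch of the compare-before-write and cache-on-success ordering,
 * assuming kernel C; struct example_dev and example_hw_set_epp() are
 * hypothetical stand-ins for the cpudata structure and the MSR or
 * shared-memory write.
 */
#include <linux/types.h>

struct example_dev {
	u8 epp_cached;
};

static int example_hw_set_epp(struct example_dev *dev, u8 epp)
{
	return 0;	/* stand-in for the real MSR or shared-memory write */
}

static int example_set_epp(struct example_dev *dev, u8 epp)
{
	int ret;

	/* skip the hardware write entirely if nothing changed */
	if (epp == dev->epp_cached)
		return 0;

	ret = example_hw_set_epp(dev, epp);
	if (ret)
		return ret;	/* the cache stays untouched on failure */

	dev->epp_cached = epp;	/* cache only after a successful write */
	return 0;
}
```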
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 107 +++++++++++++++--------------------
1 file changed, 45 insertions(+), 62 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -216,29 +216,28 @@ static inline int get_mode_idx_from_str(
static DEFINE_MUTEX(amd_pstate_limits_lock);
static DEFINE_MUTEX(amd_pstate_driver_lock);
-static s16 msr_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
+static s16 msr_get_epp(struct amd_cpudata *cpudata)
{
+ u64 value;
int ret;
- if (!cppc_req_cached) {
- ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req_cached);
- if (ret < 0) {
- pr_debug("Could not retrieve energy perf value (%d)\n", ret);
- return ret;
- }
+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
+ if (ret < 0) {
+ pr_debug("Could not retrieve energy perf value (%d)\n", ret);
+ return ret;
}
- return FIELD_GET(AMD_PSTATE_EPP_PERF_MASK, cppc_req_cached);
+ return FIELD_GET(AMD_PSTATE_EPP_PERF_MASK, value);
}
DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp);
-static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
+static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata)
{
- return static_call(amd_pstate_get_epp)(cpudata, cppc_req_cached);
+ return static_call(amd_pstate_get_epp)(cpudata);
}
-static s16 shmem_get_epp(struct amd_cpudata *cpudata, u64 dummy)
+static s16 shmem_get_epp(struct amd_cpudata *cpudata)
{
u64 epp;
int ret;
@@ -252,35 +251,6 @@ static s16 shmem_get_epp(struct amd_cpud
return (s16)(epp & 0xff);
}
-static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata)
-{
- s16 epp;
- int index = -EINVAL;
-
- epp = amd_pstate_get_epp(cpudata, 0);
- if (epp < 0)
- return epp;
-
- switch (epp) {
- case AMD_CPPC_EPP_PERFORMANCE:
- index = EPP_INDEX_PERFORMANCE;
- break;
- case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
- index = EPP_INDEX_BALANCE_PERFORMANCE;
- break;
- case AMD_CPPC_EPP_BALANCE_POWERSAVE:
- index = EPP_INDEX_BALANCE_POWERSAVE;
- break;
- case AMD_CPPC_EPP_POWERSAVE:
- index = EPP_INDEX_POWERSAVE;
- break;
- default:
- break;
- }
-
- return index;
-}
-
static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, bool fast_switch)
{
@@ -305,19 +275,23 @@ static inline int amd_pstate_update_perf
static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp)
{
- u64 value = READ_ONCE(cpudata->cppc_req_cached);
+ u64 value, prev;
int ret;
+ value = prev = READ_ONCE(cpudata->cppc_req_cached);
value &= ~AMD_PSTATE_EPP_PERF_MASK;
value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, epp);
+ if (value == prev)
+ return 0;
+
ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
if (ret) {
pr_err("failed to set energy perf value (%d)\n", ret);
return ret;
}
- cpudata->epp_cached = epp;
+ WRITE_ONCE(cpudata->epp_cached, epp);
WRITE_ONCE(cpudata->cppc_req_cached, value);
return ret;
@@ -335,13 +309,16 @@ static int shmem_set_epp(struct amd_cpud
int ret;
struct cppc_perf_ctrls perf_ctrls;
+ if (epp == cpudata->epp_cached)
+ return 0;
+
perf_ctrls.energy_perf = epp;
ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
if (ret) {
pr_debug("failed to set energy perf value (%d)\n", ret);
return ret;
}
- cpudata->epp_cached = epp;
+ WRITE_ONCE(cpudata->epp_cached, epp);
return ret;
}
@@ -1244,9 +1221,22 @@ static ssize_t show_energy_performance_p
struct amd_cpudata *cpudata = policy->driver_data;
int preference;
- preference = amd_pstate_get_energy_pref_index(cpudata);
- if (preference < 0)
- return preference;
+ switch (cpudata->epp_cached) {
+ case AMD_CPPC_EPP_PERFORMANCE:
+ preference = EPP_INDEX_PERFORMANCE;
+ break;
+ case AMD_CPPC_EPP_BALANCE_PERFORMANCE:
+ preference = EPP_INDEX_BALANCE_PERFORMANCE;
+ break;
+ case AMD_CPPC_EPP_BALANCE_POWERSAVE:
+ preference = EPP_INDEX_BALANCE_POWERSAVE;
+ break;
+ case AMD_CPPC_EPP_POWERSAVE:
+ preference = EPP_INDEX_POWERSAVE;
+ break;
+ default:
+ return -EINVAL;
+ }
return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]);
}
@@ -1531,7 +1521,7 @@ static int amd_pstate_epp_cpu_init(struc
policy->driver_data = cpudata;
- cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata, 0);
+ cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata);
policy->min = policy->cpuinfo.min_freq;
policy->max = policy->cpuinfo.max_freq;
@@ -1585,35 +1575,26 @@ static int amd_pstate_epp_update_limit(s
{
struct amd_cpudata *cpudata = policy->driver_data;
u64 value;
- s16 epp;
amd_pstate_update_min_max_limit(policy);
value = READ_ONCE(cpudata->cppc_req_cached);
value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
- AMD_PSTATE_DES_PERF_MASK);
+ AMD_PSTATE_DES_PERF_MASK | AMD_PSTATE_EPP_PERF_MASK);
value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, cpudata->max_limit_perf);
value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, 0);
value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, cpudata->min_limit_perf);
- /* Get BIOS pre-defined epp value */
- epp = amd_pstate_get_epp(cpudata, value);
- if (epp < 0) {
- /**
- * This return value can only be negative for shared_memory
- * systems where EPP register read/write not supported.
- */
- return epp;
- }
-
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
- epp = 0;
+ WRITE_ONCE(cpudata->epp_cached, 0);
+ value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, cpudata->epp_cached);
WRITE_ONCE(cpudata->cppc_req_cached, value);
if (trace_amd_pstate_epp_perf_enabled()) {
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp,
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
+ cpudata->epp_cached,
cpudata->min_limit_perf,
cpudata->max_limit_perf,
cpudata->boost_state);
@@ -1622,7 +1603,7 @@ static int amd_pstate_epp_update_limit(s
amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
cpudata->max_limit_perf, false);
- return amd_pstate_set_epp(cpudata, epp);
+ return amd_pstate_set_epp(cpudata, READ_ONCE(cpudata->epp_cached));
}
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
@@ -1638,6 +1619,8 @@ static int amd_pstate_epp_set_policy(str
cpudata->policy = policy->policy;
+ guard(mutex)(&amd_pstate_limits_lock);
+
ret = amd_pstate_epp_update_limit(policy);
if (ret)
return ret;

View File

@@ -0,0 +1,182 @@
From 98f0f9202cd0fc549f5beaaaf8750658d8ee2140 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:44 -0600
Subject: cpufreq/amd-pstate: Always write EPP value when updating perf
For MSR systems the EPP value is in the same register as the perf targets,
so dividing them into two separate MSR writes is wasteful.
In msr_update_perf(), update both the EPP and perf values in one write to
MSR_AMD_CPPC_REQ, and cache them if the write succeeds.
To accomplish this, plumb the EPP value into the update_perf call and modify
all its callers to check the return value.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 71 ++++++++++++++++++++++--------------
1 file changed, 43 insertions(+), 28 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -252,25 +252,36 @@ static s16 shmem_get_epp(struct amd_cpud
}
static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
- u32 des_perf, u32 max_perf, bool fast_switch)
+ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch)
{
+ u64 value;
+
+ value = READ_ONCE(cpudata->cppc_req_cached);
if (fast_switch) {
wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached));
return 0;
+ } else {
+ int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
+ READ_ONCE(cpudata->cppc_req_cached));
+ if (ret)
+ return ret;
}
- return wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
- READ_ONCE(cpudata->cppc_req_cached));
+ WRITE_ONCE(cpudata->cppc_req_cached, value);
+ WRITE_ONCE(cpudata->epp_cached, epp);
+
+ return 0;
}
DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
- u32 max_perf, bool fast_switch)
+ u32 max_perf, u32 epp,
+ bool fast_switch)
{
return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf,
- max_perf, fast_switch);
+ max_perf, epp, fast_switch);
}
static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp)
@@ -489,12 +500,19 @@ static inline int amd_pstate_init_perf(s
return static_call(amd_pstate_init_perf)(cpudata);
}
-static int shmem_update_perf(struct amd_cpudata *cpudata,
- u32 min_perf, u32 des_perf,
- u32 max_perf, bool fast_switch)
+static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
+ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch)
{
struct cppc_perf_ctrls perf_ctrls;
+ if (cppc_state == AMD_PSTATE_ACTIVE) {
+ int ret = shmem_set_epp(cpudata, epp);
+
+ if (ret)
+ return ret;
+ WRITE_ONCE(cpudata->epp_cached, epp);
+ }
+
perf_ctrls.max_perf = max_perf;
perf_ctrls.min_perf = min_perf;
perf_ctrls.desired_perf = des_perf;
@@ -575,10 +593,10 @@ static void amd_pstate_update(struct amd
WRITE_ONCE(cpudata->cppc_req_cached, value);
- amd_pstate_update_perf(cpudata, min_perf, des_perf,
- max_perf, fast_switch);
+ amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch);
cpufreq_policy_put:
+
cpufreq_cpu_put(policy);
}
@@ -1575,6 +1593,7 @@ static int amd_pstate_epp_update_limit(s
{
struct amd_cpudata *cpudata = policy->driver_data;
u64 value;
+ u32 epp;
amd_pstate_update_min_max_limit(policy);
@@ -1587,23 +1606,19 @@ static int amd_pstate_epp_update_limit(s
value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, cpudata->min_limit_perf);
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
- WRITE_ONCE(cpudata->epp_cached, 0);
- value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, cpudata->epp_cached);
-
- WRITE_ONCE(cpudata->cppc_req_cached, value);
+ epp = 0;
+ else
+ epp = READ_ONCE(cpudata->epp_cached);
if (trace_amd_pstate_epp_perf_enabled()) {
- trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf,
- cpudata->epp_cached,
+ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp,
cpudata->min_limit_perf,
cpudata->max_limit_perf,
cpudata->boost_state);
}
- amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
- cpudata->max_limit_perf, false);
-
- return amd_pstate_set_epp(cpudata, READ_ONCE(cpudata->epp_cached));
+ return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U,
+ cpudata->max_limit_perf, epp, false);
}
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
@@ -1634,7 +1649,7 @@ static int amd_pstate_epp_set_policy(str
return 0;
}
-static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
+static int amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
{
u64 max_perf;
int ret;
@@ -1652,17 +1667,19 @@ static void amd_pstate_epp_reenable(stru
max_perf, cpudata->boost_state);
}
- amd_pstate_update_perf(cpudata, 0, 0, max_perf, false);
- amd_pstate_set_epp(cpudata, cpudata->epp_cached);
+ return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false);
}
static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
+ int ret;
pr_debug("AMD CPU Core %d going online\n", cpudata->cpu);
- amd_pstate_epp_reenable(cpudata);
+ ret = amd_pstate_epp_reenable(cpudata);
+ if (ret)
+ return ret;
cpudata->suspended = false;
return 0;
@@ -1686,10 +1703,8 @@ static int amd_pstate_epp_cpu_offline(st
min_perf, min_perf, cpudata->boost_state);
}
- amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false);
- amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE);
-
- return 0;
+ return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf,
+ AMD_CPPC_EPP_BALANCE_POWERSAVE, false);
}
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)

View File

@@ -0,0 +1,158 @@
From acf3f432287638044e472501a1d5969abee15043 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:45 -0600
Subject: cpufreq/amd-pstate: Check if CPPC request has changed before writing
to the MSR or shared memory
Move the common MSR field formatting code to msr_update_perf() from
its callers.
Ensure that the MSR write is actually needed before issuing it.
Also drop the comparison from the passive flow tracing.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate-trace.h | 7 +----
drivers/cpufreq/amd-pstate.c | 47 +++++++++++-------------------
2 files changed, 18 insertions(+), 36 deletions(-)
--- a/drivers/cpufreq/amd-pstate-trace.h
+++ b/drivers/cpufreq/amd-pstate-trace.h
@@ -32,7 +32,6 @@ TRACE_EVENT(amd_pstate_perf,
u64 aperf,
u64 tsc,
unsigned int cpu_id,
- bool changed,
bool fast_switch
),
@@ -44,7 +43,6 @@ TRACE_EVENT(amd_pstate_perf,
aperf,
tsc,
cpu_id,
- changed,
fast_switch
),
@@ -57,7 +55,6 @@ TRACE_EVENT(amd_pstate_perf,
__field(unsigned long long, aperf)
__field(unsigned long long, tsc)
__field(unsigned int, cpu_id)
- __field(bool, changed)
__field(bool, fast_switch)
),
@@ -70,11 +67,10 @@ TRACE_EVENT(amd_pstate_perf,
__entry->aperf = aperf;
__entry->tsc = tsc;
__entry->cpu_id = cpu_id;
- __entry->changed = changed;
__entry->fast_switch = fast_switch;
),
- TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u changed=%s fast_switch=%s",
+ TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s",
(unsigned long)__entry->min_perf,
(unsigned long)__entry->target_perf,
(unsigned long)__entry->capacity,
@@ -83,7 +79,6 @@ TRACE_EVENT(amd_pstate_perf,
(unsigned long long)__entry->aperf,
(unsigned long long)__entry->tsc,
(unsigned int)__entry->cpu_id,
- (__entry->changed) ? "true" : "false",
(__entry->fast_switch) ? "true" : "false"
)
);
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -254,15 +254,26 @@ static s16 shmem_get_epp(struct amd_cpud
static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, u32 epp, bool fast_switch)
{
- u64 value;
+ u64 value, prev;
+
+ value = prev = READ_ONCE(cpudata->cppc_req_cached);
+
+ value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
+ AMD_PSTATE_DES_PERF_MASK | AMD_PSTATE_EPP_PERF_MASK);
+ value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, max_perf);
+ value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, des_perf);
+ value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, min_perf);
+ value |= FIELD_PREP(AMD_PSTATE_EPP_PERF_MASK, epp);
+
+ if (value == prev)
+ return 0;
- value = READ_ONCE(cpudata->cppc_req_cached);
if (fast_switch) {
- wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached));
+ wrmsrl(MSR_AMD_CPPC_REQ, value);
return 0;
} else {
- int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
- READ_ONCE(cpudata->cppc_req_cached));
+ int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
+
if (ret)
return ret;
}
@@ -558,9 +569,7 @@ static void amd_pstate_update(struct amd
{
unsigned long max_freq;
struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu);
- u64 prev = READ_ONCE(cpudata->cppc_req_cached);
u32 nominal_perf = READ_ONCE(cpudata->nominal_perf);
- u64 value = prev;
des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
@@ -576,27 +585,14 @@ static void amd_pstate_update(struct amd
if (!cpudata->boost_supported)
max_perf = min_t(unsigned long, nominal_perf, max_perf);
- value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
- AMD_PSTATE_DES_PERF_MASK);
- value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, max_perf);
- value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, des_perf);
- value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, min_perf);
-
if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) {
trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq,
cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc,
- cpudata->cpu, (value != prev), fast_switch);
+ cpudata->cpu, fast_switch);
}
- if (value == prev)
- goto cpufreq_policy_put;
-
- WRITE_ONCE(cpudata->cppc_req_cached, value);
-
amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch);
-cpufreq_policy_put:
-
cpufreq_cpu_put(policy);
}
@@ -1592,19 +1588,10 @@ static void amd_pstate_epp_cpu_exit(stru
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
- u64 value;
u32 epp;
amd_pstate_update_min_max_limit(policy);
- value = READ_ONCE(cpudata->cppc_req_cached);
-
- value &= ~(AMD_PSTATE_MAX_PERF_MASK | AMD_PSTATE_MIN_PERF_MASK |
- AMD_PSTATE_DES_PERF_MASK | AMD_PSTATE_EPP_PERF_MASK);
- value |= FIELD_PREP(AMD_PSTATE_MAX_PERF_MASK, cpudata->max_limit_perf);
- value |= FIELD_PREP(AMD_PSTATE_DES_PERF_MASK, 0);
- value |= FIELD_PREP(AMD_PSTATE_MIN_PERF_MASK, cpudata->min_limit_perf);
-
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
epp = 0;
else

View File

@@ -0,0 +1,42 @@
From 55bcae033b08b1b926f927616cca17242cfcfe59 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:46 -0600
Subject: cpufreq/amd-pstate: Drop ret variable from
amd_pstate_set_energy_pref_index()
The ret variable is not necessary.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -348,13 +348,11 @@ static int shmem_set_epp(struct amd_cpud
static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
int pref_index)
{
- int epp = -EINVAL;
- int ret;
+ int epp;
if (!pref_index)
epp = cpudata->epp_default;
-
- if (epp == -EINVAL)
+ else
epp = epp_values[pref_index];
if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) {
@@ -370,9 +368,7 @@ static int amd_pstate_set_energy_pref_in
cpudata->boost_state);
}
- ret = amd_pstate_set_epp(cpudata, epp);
-
- return ret;
+ return amd_pstate_set_epp(cpudata, epp);
}
static inline int msr_cppc_enable(bool enable)

View File

@@ -0,0 +1,58 @@
From f1edcca6a96f087f5862a14012fafd4eb9738601 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Dec 2024 16:28:47 -0600
Subject: cpufreq/amd-pstate: Set different default EPP policy for Epyc and
Ryzen
For Ryzen systems the EPP policy set by the BIOS is generally configured
for performance, as this is the default register value for the CPPC request
MSR.
If a user doesn't run additional software to configure EPP, the system
will default to a performance bias and consume extra battery power. Instead,
configure the default to "balanced_performance" in this case.
Suggested-by: Artem S. Tashkinov <aros@gmx.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219526
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1531,8 +1531,6 @@ static int amd_pstate_epp_cpu_init(struc
policy->driver_data = cpudata;
- cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata);
-
policy->min = policy->cpuinfo.min_freq;
policy->max = policy->cpuinfo.max_freq;
@@ -1543,10 +1541,13 @@ static int amd_pstate_epp_cpu_init(struc
* the default cpufreq governor is neither powersave nor performance.
*/
if (amd_pstate_acpi_pm_profile_server() ||
- amd_pstate_acpi_pm_profile_undefined())
+ amd_pstate_acpi_pm_profile_undefined()) {
policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
+ cpudata->epp_default = amd_pstate_get_epp(cpudata);
+ } else {
policy->policy = CPUFREQ_POLICY_POWERSAVE;
+ cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE;
+ }
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
@@ -1559,6 +1560,9 @@ static int amd_pstate_epp_cpu_init(struc
return ret;
WRITE_ONCE(cpudata->cppc_cap1_cached, value);
}
+ ret = amd_pstate_set_epp(cpudata, cpudata->epp_default);
+ if (ret)
+ return ret;
current_pstate_driver->adjust_perf = NULL;

View File

@@ -1,321 +0,0 @@
From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:37 -0700
Subject: perf: Generic hotplug support for a PMU with a scope
The perf subsystem assumes that the counters of a PMU are per-CPU, so the
user-space tool reads a counter from each CPU in system-wide mode. However,
many PMUs don't have a per-CPU counter; the counter is effective for a
scope, e.g. a die or a socket. To address this, a cpumask is exposed by the
kernel driver to restrict collection to one CPU that stands for a specific
scope. If that CPU is removed, hotplug support has to be implemented in
each such driver.
The code to support the cpumask and hotplug is very similar across drivers:
- Expose a cpumask in sysfs.
- Pick up another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set event->cpu to the CPU in the cpumask.
Similar duplicated code is implemented in each such PMU driver. It would
be good to introduce a generic infrastructure to avoid this duplication.
Five common scopes are implemented here: core, die, cluster, pkg, and
system-wide. The scope can be set when a PMU is registered; if so, a
"cpumask" is automatically exposed for the PMU.
The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.
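To illustrate the interface added here, the sketch below registers a hypothetical uncore PMU with package scope; only pmu::scope, PERF_PMU_SCOPE_PKG, and perf_pmu_register() come from this patch, everything named example_* is illustrative and the callback bodies are omitted.

```
/*
 * Sketch of registering a PMU with a scope, assuming the interface added by
 * this patch; "example_uncore" and its callbacks are hypothetical and their
 * bodies are omitted.
 */
#include <linux/init.h>
#include <linux/perf_event.h>

static int  example_uncore_event_init(struct perf_event *event);
static int  example_uncore_add(struct perf_event *event, int flags);
static void example_uncore_del(struct perf_event *event, int flags);
static void example_uncore_start(struct perf_event *event, int flags);
static void example_uncore_stop(struct perf_event *event, int flags);
static void example_uncore_read(struct perf_event *event);

static struct pmu example_uncore_pmu = {
	.task_ctx_nr	= perf_invalid_context,
	.scope		= PERF_PMU_SCOPE_PKG,	/* one active CPU per package */
	.event_init	= example_uncore_event_init,
	.add		= example_uncore_add,
	.del		= example_uncore_del,
	.start		= example_uncore_start,
	.stop		= example_uncore_stop,
	.read		= example_uncore_read,
};

static int __init example_uncore_init(void)
{
	/*
	 * With a non-NONE scope the core exposes a "cpumask" attribute and
	 * migrates the PMU context to another CPU in the same package when
	 * the current one goes offline.
	 */
	return perf_pmu_register(&example_uncore_pmu, "example_uncore", -1);
}
```

With this in place the driver no longer needs its own hotplug notifier or cpumask attribute; the generic code picks the representative CPU and migrates events on hot-unplug.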
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 18 ++++
kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+ PERF_PMU_SCOPE_NONE = 0,
+ PERF_PMU_SCOPE_CORE,
+ PERF_PMU_SCOPE_DIE,
+ PERF_PMU_SCOPE_CLUSTER,
+ PERF_PMU_SCOPE_PKG,
+ PERF_PMU_SCOPE_SYS_WIDE,
+ PERF_PMU_MAX_SCOPE,
+};
+
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
@@ -315,6 +328,11 @@ struct pmu {
*/
int capabilities;
+ /*
+ * PMU scope
+ */
+ unsigned int scope;
+
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -411,6 +411,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, c
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11750,6 +11814,22 @@ static int perf_try_init_event(struct pm
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->cpu = cpu;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_c
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(vo
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context(
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context(
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context(
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ cpumask_set_cpu(cpu, perf_online_mask);
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Unconditionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (!topology_sibling_cpumask(cpu)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ return;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
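For context, a minimal driver-side sketch of how a PMU could adopt this new interface (the my_uncore_* names are hypothetical and not part of this series): the driver only advertises its scope before registration, and the generic code then picks a reader CPU per scope instance, exposes it through the common "cpumask" sysfs attribute, and migrates events on CPU hotplug.

/* Hypothetical sketch; assumes the usual event callbacks exist elsewhere. */
static struct pmu my_uncore_pmu = {
	.task_ctx_nr	= perf_invalid_context,
	.event_init	= my_uncore_event_init,
	.add		= my_uncore_event_add,
	.del		= my_uncore_event_del,
	.start		= my_uncore_event_start,
	.stop		= my_uncore_event_stop,
	.read		= my_uncore_event_read,
	.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
	.scope		= PERF_PMU_SCOPE_PKG,	/* one reader CPU per package */
	.module		= THIS_MODULE,
};

static int __init my_uncore_init(void)
{
	/* perf_pmu_register() now rejects scope >= PERF_PMU_MAX_SCOPE. */
	return perf_pmu_register(&my_uncore_pmu, "my_uncore", -1);
}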

View File

@@ -0,0 +1,87 @@
From 997fe7115fcb9392f31aa07f407675194452dc0a Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 10 Oct 2024 07:26:03 -0700
Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug
There is extra code in the CPU hotplug function to allocate rapl pmus,
which makes the generic PMU hotplug support hard to apply.
As long as the rapl pmus can be allocated upfront for each die/socket,
the code doesn't need to be implemented in the CPU hotplug function.
Move the code to the init_rapl_pmus(), and allocate a PMU for each
possible die/socket.
Tested-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 44 ++++++++++++++++++++++++++++--------------
1 file changed, 30 insertions(+), 14 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -602,19 +602,8 @@ static int rapl_cpu_online(unsigned int
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
- if (!pmu) {
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
- return -ENOMEM;
-
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
-
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
- }
+ if (!pmu)
+ return -ENOMEM;
/*
* Check if there is an online cpu in the package which collects rapl
@@ -707,6 +696,32 @@ static const struct attribute_group *rap
NULL,
};
+static int __init init_rapl_pmu(void)
+{
+ struct rapl_pmu *pmu;
+ int idx;
+
+ for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
+ pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+ if (!pmu)
+ goto free;
+
+ raw_spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(pmu);
+
+ rapl_pmus->pmus[idx] = pmu;
+ }
+
+ return 0;
+free:
+ for (; idx > 0; idx--)
+ kfree(rapl_pmus->pmus[idx - 1]);
+ return -ENOMEM;
+}
+
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages();
@@ -730,7 +745,8 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
- return 0;
+
+ return init_rapl_pmu();
}
static struct rapl_model model_snb = {

View File

@@ -1,71 +0,0 @@
From 8c7eb17e722a6a45c4436e5debb9336089b21d9b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:38 -0700
Subject: perf: Add PERF_EV_CAP_READ_SCOPE
Usually, an event can be read from any CPU of the scope. It doesn't need
to be read from the advertised CPU.
Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with
scope can be read from any active CPU in the scope.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 3 +++
kernel/events/core.c | 14 +++++++++++---
2 files changed, 14 insertions(+), 3 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -633,10 +633,13 @@ typedef void (*perf_overflow_handler_t)(
* PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
* cannot be a group leader. If an event with this flag is detached from the
* group it is scheduled out and moved into an unrecoverable ERROR state.
+ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
+ * PMU scope where it is active.
*/
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
#define PERF_EV_CAP_SIBLING BIT(2)
+#define PERF_EV_CAP_READ_SCOPE BIT(3)
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4477,16 +4477,24 @@ struct perf_read_data {
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
@@ -11824,7 +11832,7 @@ static int perf_try_init_event(struct pm
if (cpu >= nr_cpu_ids)
ret = -ENODEV;
else
- event->cpu = cpu;
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
} else {
ret = -ENODEV;
}
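In short, an illustrative condensation of the hunk above (a sketch, not a verbatim copy of the kernel code): once the core grants PERF_EV_CAP_READ_SCOPE to an event of a scoped PMU, a read issued from any CPU inside that scope can be served locally instead of sending an IPI to the advertised CPU.

/* Simplified decision logic, assuming core.c context for the helpers. */
static int pick_read_cpu(struct perf_event *event, int event_cpu)
{
	int local_cpu = smp_processor_id();
	const struct cpumask *mask;

	if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
		mask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
		if (mask && cpumask_test_cpu(local_cpu, mask))
			return local_cpu;	/* same scope: read locally */
	}
	return event_cpu;			/* otherwise use the advertised CPU */
}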

View File

@@ -1,6 +1,6 @@
From 7b4f6ba1b1dc5f3120652bcb5921a697d5167bff Mon Sep 17 00:00:00 2001
From 423bd96e0e4522b19e2dca083b70ebbebb639acd Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:43 -0700
Date: Thu, 10 Oct 2024 07:26:04 -0700
Subject: perf/x86/rapl: Clean up cpumask and hotplug
The rapl pmu is die scope, which is supported by the generic perf_event
@@ -9,16 +9,17 @@ subsystem now.
Set the scope for the rapl PMU and remove all the cpumask and hotplug
codes.
Tested-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 80 +-------------------------------------
arch/x86/events/rapl.c | 90 +++-----------------------------------
include/linux/cpuhotplug.h | 1 -
2 files changed, 2 insertions(+), 79 deletions(-)
2 files changed, 6 insertions(+), 85 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -135,7 +135,6 @@ struct rapl_model {
@@ -148,7 +148,6 @@ struct rapl_model {
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
@@ -26,7 +27,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
@@ -340,8 +339,6 @@ static int rapl_pmu_event_init(struct pe
@@ -369,8 +368,6 @@ static int rapl_pmu_event_init(struct pe
if (event->cpu < 0)
return -EINVAL;
@@ -35,7 +36,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
return -EINVAL;
@@ -360,7 +357,6 @@ static int rapl_pmu_event_init(struct pe
@@ -389,7 +386,6 @@ static int rapl_pmu_event_init(struct pe
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
return -EINVAL;
@@ -43,7 +44,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
@@ -374,23 +370,6 @@ static void rapl_pmu_event_read(struct p
@@ -403,23 +399,6 @@ static void rapl_pmu_event_read(struct p
rapl_event_update(event);
}
@@ -67,7 +68,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
@@ -438,7 +417,6 @@ static struct attribute_group rapl_pmu_f
@@ -467,7 +446,6 @@ static struct attribute_group rapl_pmu_f
};
static const struct attribute_group *rapl_attr_groups[] = {
@@ -75,7 +76,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
@@ -541,49 +519,6 @@ static struct perf_msr amd_rapl_msrs[] =
@@ -570,54 +548,6 @@ static struct perf_msr amd_rapl_msrs[] =
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
@@ -90,7 +91,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
-
- pmu->cpu = -1;
- /* Find a new cpu to collect rapl events */
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
-
- /* Migrate rapl events to the new target */
- if (target < nr_cpu_ids) {
@@ -103,6 +104,11 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
-
-static int rapl_cpu_online(unsigned int cpu)
-{
- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
- if (rapl_pmu_idx < 0) {
- pr_err("topology_logical_(package/die)_id() returned a negative value");
- return -EINVAL;
- }
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
- int target;
-
@@ -113,7 +119,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
- * Check if there is an online cpu in the package which collects rapl
- * events already.
- */
- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
- if (target < nr_cpu_ids)
- return 0;
-
@@ -125,15 +131,29 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static int rapl_check_hw_unit(struct rapl_model *rm)
{
u64 msr_rapl_power_unit_bits;
@@ -707,6 +642,7 @@ static int __init init_rapl_pmus(void)
@@ -725,9 +655,12 @@ free:
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages();
+ int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
- if (!rapl_pmu_is_pkg_scope())
+ if (!rapl_pmu_is_pkg_scope()) {
nr_rapl_pmu *= topology_max_dies_per_package();
+ rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+ }
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -743,6 +676,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
rapl_pmus->pmu.module = THIS_MODULE;
+ rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
init_rapl_pmu();
@@ -857,24 +793,13 @@ static int __init rapl_pmu_init(void)
@@ -892,24 +826,13 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
@@ -159,7 +179,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
@@ -884,7 +809,6 @@ module_init(rapl_pmu_init);
@@ -919,7 +842,6 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
@@ -169,7 +189,7 @@ Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
}
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -207,7 +207,6 @@ enum cpuhp_state {
@@ -208,7 +208,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,

View File

@@ -1,286 +0,0 @@
From 09c1529eb102b486220c35546f2663ca858a2943 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:39 -0700
Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug
There are three cstate PMUs with different scopes, core, die and module.
The scopes are supported by the generic perf_event subsystem now.
Set the scope for each PMU and remove all the cpumask and hotplug codes.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/intel/cstate.c | 142 ++-------------------------------
include/linux/cpuhotplug.h | 2 -
2 files changed, 5 insertions(+), 139 deletions(-)
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(st
static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf);
-
/* Model -> events mapping */
struct cstate_model {
unsigned long core_events;
@@ -206,22 +202,9 @@ static struct attribute_group cstate_for
.attrs = cstate_format_attrs,
};
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
- .attrs = cstate_cpumask_attrs,
-};
-
static const struct attribute_group *cstate_attr_groups[] = {
&cstate_events_attr_group,
&cstate_format_attr_group,
- &cpumask_attr_group,
NULL,
};
@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
-static cpumask_t cstate_pkg_cpu_mask;
-
/* cstate_module PMU */
static struct pmu cstate_module_pmu;
static bool has_cstate_module;
@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = {
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
-static cpumask_t cstate_module_cpu_mask;
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- if (pmu == &cstate_core_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
- else if (pmu == &cstate_pkg_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
- else if (pmu == &cstate_module_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
- else
- return 0;
-}
-
static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
- int cpu;
if (event->attr.type != event->pmu->type)
return -ENOENT;
@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct
if (!(core_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;
-
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
event->hw.event_base = pkg_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(event->cpu));
} else if (event->pmu == &cstate_module_pmu) {
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
return -EINVAL;
@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct
if (!(module_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = module_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
- if (cpu >= nr_cpu_ids)
- return -ENODEV;
-
- event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
return 0;
@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct p
return 0;
}
-/*
- * Check if exiting cpu is the designated reader. If so migrate the
- * events when there is a valid target available
- */
-static int cstate_cpu_exit(unsigned int cpu)
-{
- unsigned int target;
-
- if (has_cstate_core &&
- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
-
- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_core_cpu_mask);
- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
- }
- }
-
- if (has_cstate_pkg &&
- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
-
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
- }
- }
-
- if (has_cstate_module &&
- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
-
- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_module_cpu_mask);
- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
- }
- }
- return 0;
-}
-
-static int cstate_cpu_init(unsigned int cpu)
-{
- unsigned int target;
-
- /*
- * If this is the first online thread of that core, set it in
- * the core cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(cpu));
-
- if (has_cstate_core && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-
- /*
- * If this is the first online thread of that package, set it
- * in the package cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(cpu));
- if (has_cstate_pkg && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-
- /*
- * If this is the first online thread of that cluster, set it
- * in the cluster cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(cpu));
- if (has_cstate_module && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
-
- return 0;
-}
-
static const struct attribute_group *core_attr_update[] = {
&group_cstate_core_c1,
&group_cstate_core_c3,
@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CORE,
.module = THIS_MODULE,
};
@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_PKG,
.module = THIS_MODULE,
};
@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CLUSTER,
.module = THIS_MODULE,
};
@@ -810,9 +684,6 @@ static int __init cstate_probe(const str
static inline void cstate_cleanup(void)
{
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
-
if (has_cstate_core)
perf_pmu_unregister(&cstate_core_pmu);
@@ -827,11 +698,6 @@ static int __init cstate_init(void)
{
int err;
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
- "perf/x86/cstate:starting", cstate_cpu_init, NULL);
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
- "perf/x86/cstate:online", NULL, cstate_cpu_exit);
-
if (has_cstate_core) {
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
if (err) {
@@ -844,6 +710,8 @@ static int __init cstate_init(void)
if (has_cstate_pkg) {
if (topology_max_dies_per_package() > 1) {
+ /* CLX-AP is multi-die and the cstate is die-scope */
+ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
err = perf_pmu_register(&cstate_pkg_pmu,
"cstate_die", -1);
} else {
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,7 +152,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
- CPUHP_AP_PERF_X86_CSTATE_STARTING,
CPUHP_AP_PERF_XTENSA_STARTING,
CPUHP_AP_ARM_VFP_STARTING,
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
@@ -209,7 +208,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
CPUHP_AP_PERF_X86_RAPL_ONLINE,
- CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,

View File

@@ -0,0 +1,40 @@
From 9e3190a0f75324a7dcb66da336584918243b2c77 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 15 Nov 2024 06:07:57 +0000
Subject: perf/x86/rapl: Remove the unused get_rapl_pmu_cpumask() function
commit 9e9af8bbb5f9 ("perf/x86/rapl: Clean up cpumask and hotplug")
removes the cpumask handling from rapl. After that change, the
get_rapl_pmu_cpumask() function is no longer needed, so remove it.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -153,7 +153,7 @@ static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
/*
- * Helper functions to get the correct topology macros according to the
+ * Helper function to get the correct topology id according to the
* RAPL PMU scope.
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
@@ -162,12 +162,6 @@ static inline unsigned int get_rapl_pmu_
topology_logical_die_id(cpu);
}
-static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
-{
- return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
- topology_die_cpumask(cpu);
-}
-
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);

View File

@@ -1,188 +0,0 @@
From f91da33af8295b4b3d73a2083225f69e1d5ff301 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:40 -0700
Subject: iommu/vt-d: Clean up cpumask and hotplug for perfmon
The iommu PMU is system-wide scope, which is supported by the generic
perf_event subsystem now.
Set the scope for the iommu PMU and remove all the cpumask and hotplug
codes.
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Cc: iommu@lists.linux.dev
---
drivers/iommu/intel/iommu.h | 2 -
drivers/iommu/intel/perfmon.c | 111 +---------------------------------
2 files changed, 2 insertions(+), 111 deletions(-)
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -687,8 +687,6 @@ struct iommu_pmu {
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
unsigned char irq_name[16];
- struct hlist_node cpuhp_node;
- int cpu;
};
#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_
.attrs = attrs_empty,
};
-static cpumask_t iommu_pmu_cpu_mask;
-
-static ssize_t
-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
-}
-static DEVICE_ATTR_RO(cpumask);
-
-static struct attribute *iommu_pmu_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL
-};
-
-static struct attribute_group iommu_pmu_cpumask_attr_group = {
- .attrs = iommu_pmu_cpumask_attrs,
-};
-
static const struct attribute_group *iommu_pmu_attr_groups[] = {
&iommu_pmu_format_attr_group,
&iommu_pmu_events_attr_group,
- &iommu_pmu_cpumask_attr_group,
NULL
};
@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct i
iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
iommu_pmu->pmu.module = THIS_MODULE;
return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(st
iommu->perf_irq = 0;
}
-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
-
- if (cpumask_empty(&iommu_pmu_cpu_mask))
- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
-
- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
- iommu_pmu->cpu = cpu;
-
- return 0;
-}
-
-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
- int target = cpumask_first(&iommu_pmu_cpu_mask);
-
- /*
- * The iommu_pmu_cpu_mask has been updated when offline the CPU
- * for the first iommu_pmu. Migrate the other iommu_pmu to the
- * new target.
- */
- if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
- return 0;
- }
-
- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
- return 0;
-
- target = cpumask_any_but(cpu_online_mask, cpu);
-
- if (target < nr_cpu_ids)
- cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
- else
- return 0;
-
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
-
- return 0;
-}
-
-static int nr_iommu_pmu;
-static enum cpuhp_state iommu_cpuhp_slot;
-
-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
-{
- int ret;
-
- if (!nr_iommu_pmu) {
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
- "driver/iommu/intel/perfmon:online",
- iommu_pmu_cpu_online,
- iommu_pmu_cpu_offline);
- if (ret < 0)
- return ret;
- iommu_cpuhp_slot = ret;
- }
-
- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
- if (ret) {
- if (!nr_iommu_pmu)
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
- return ret;
- }
- nr_iommu_pmu++;
-
- return 0;
-}
-
-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
-{
- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
-
- if (--nr_iommu_pmu)
- return;
-
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
-}
-
void iommu_pmu_register(struct intel_iommu *iommu)
{
struct iommu_pmu *iommu_pmu = iommu->pmu;
@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iom
if (__iommu_pmu_register(iommu))
goto err;
- if (iommu_pmu_cpuhp_setup(iommu_pmu))
- goto unregister;
-
/* Set interrupt for overflow */
if (iommu_pmu_set_interrupt(iommu))
- goto cpuhp_free;
+ goto unregister;
return;
-cpuhp_free:
- iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
perf_pmu_unregister(&iommu_pmu->pmu);
err:
@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_i
return;
iommu_pmu_unset_interrupt(iommu);
- iommu_pmu_cpuhp_free(iommu_pmu);
perf_pmu_unregister(&iommu_pmu->pmu);
}

View File

@@ -1,19 +1,21 @@
From 9439067951f4d857272836b35812af26650d9c16 Mon Sep 17 00:00:00 2001
From 947046055803695b05b1021893860c50412a8d7b Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Fri, 13 Sep 2024 15:21:41 +0000
Date: Fri, 15 Nov 2024 06:07:58 +0000
Subject: x86/topology: Introduce topology_logical_core_id()
On x86, topology_core_id() returns a unique core ID within the PKG
domain. Looking at match_smt() suggests that a core ID just needs to be
unique within a LLC domain. For use cases such as the per-core RAPL PMU,
unique within a LLC domain. For use cases such as the core RAPL PMU,
there exists a need for a unique core ID across the entire system with
multiple PKG domains. Introduce topology_logical_core_id() to derive a
unique core ID across the system.
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
Documentation/arch/x86/topology.rst | 4 ++++
arch/x86/include/asm/processor.h | 1 +
@@ -47,7 +49,7 @@ Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
u32 amd_node_id;
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_cluster
@@ -143,6 +143,7 @@ extern const struct cpumask *cpu_cluster
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
@@ -57,8 +59,8 @@ Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
#define topology_ppin(cpu) (cpu_data(cpu).ppin)
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_fil
seq_printf(m, "core_id: %u\n", c->topo.core_id);
@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_fil
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
@@ -67,7 +69,7 @@ Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_sca
@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_sca
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
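As a usage sketch (get_core_pmu_idx() is a hypothetical helper, not part of this patch): a core-scope PMU can use the new id to index a flat, system-wide array of per-core state, just as die-scope code indexes by topology_logical_die_id().

static inline unsigned int get_core_pmu_idx(int cpu)
{
	/* Unique across all packages, unlike topology_core_id(). */
	return topology_logical_core_id(cpu);
}

/* e.g. core_pmus[get_core_pmu_idx(cpu)], with the array sized for the
 * maximum number of cores in the system. */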

View File

@@ -1,238 +0,0 @@
From 76278bd3946d618ead2d9cc22612a75a4ab99ace Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:41 -0700
Subject: dmaengine: idxd: Clean up cpumask and hotplug for perfmon
The idxd PMU is system-wide scope, which is supported by the generic
perf_event subsystem now.
Set the scope for the idxd PMU and remove all the cpumask and hotplug
codes.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vinod Koul <vkoul@kernel.org>
Cc: dmaengine@vger.kernel.org
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
---
drivers/dma/idxd/idxd.h | 7 ---
drivers/dma/idxd/init.c | 3 --
drivers/dma/idxd/perfmon.c | 98 +-------------------------------------
3 files changed, 1 insertion(+), 107 deletions(-)
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -124,7 +124,6 @@ struct idxd_pmu {
struct pmu pmu;
char name[IDXD_NAME_SIZE];
- int cpu;
int n_counters;
int counter_width;
@@ -135,8 +134,6 @@ struct idxd_pmu {
unsigned long supported_filters;
int n_filters;
-
- struct hlist_node cpuhp_node;
};
#define IDXD_MAX_PRIORITY 0xf
@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct
int perfmon_pmu_init(struct idxd_device *idxd);
void perfmon_pmu_remove(struct idxd_device *idxd);
void perfmon_counter_overflow(struct idxd_device *idxd);
-void perfmon_init(void);
-void perfmon_exit(void);
#else
static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; }
static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
-static inline void perfmon_init(void) {}
-static inline void perfmon_exit(void) {}
#endif
/* debugfs */
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -878,8 +878,6 @@ static int __init idxd_init_module(void)
else
support_enqcmd = true;
- perfmon_init();
-
err = idxd_driver_register(&idxd_drv);
if (err < 0)
goto err_idxd_driver_register;
@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void
idxd_driver_unregister(&idxd_drv);
pci_unregister_driver(&idxd_pci_driver);
idxd_cdev_remove();
- perfmon_exit();
idxd_remove_debugfs();
}
module_exit(idxd_exit_module);
--- a/drivers/dma/idxd/perfmon.c
+++ b/drivers/dma/idxd/perfmon.c
@@ -6,29 +6,6 @@
#include "idxd.h"
#include "perfmon.h"
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
- char *buf);
-
-static cpumask_t perfmon_dsa_cpu_mask;
-static bool cpuhp_set_up;
-static enum cpuhp_state cpuhp_slot;
-
-/*
- * perf userspace reads this attribute to determine which cpus to open
- * counters on. It's connected to perfmon_dsa_cpu_mask, which is
- * maintained by the cpu hotplug handlers.
- */
-static DEVICE_ATTR_RO(cpumask);
-
-static struct attribute *perfmon_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
- .attrs = perfmon_cpumask_attrs,
-};
-
/*
* These attributes specify the bits in the config word that the perf
* syscall uses to pass the event ids and categories to perfmon.
@@ -67,16 +44,9 @@ static struct attribute_group perfmon_fo
static const struct attribute_group *perfmon_attr_groups[] = {
&perfmon_format_attr_group,
- &cpumask_attr_group,
NULL,
};
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
-}
-
static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
{
return &idxd_pmu->pmu == event->pmu;
@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct
return -EINVAL;
event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
- event->cpu = idxd->idxd_pmu->cpu;
event->hw.config = event->attr.config;
if (event->group_leader != event)
@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pm
idxd_pmu->pmu.stop = perfmon_pmu_event_stop;
idxd_pmu->pmu.read = perfmon_pmu_event_update;
idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
idxd_pmu->pmu.module = THIS_MODULE;
}
@@ -496,47 +466,11 @@ void perfmon_pmu_remove(struct idxd_devi
if (!idxd->idxd_pmu)
return;
- cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
perf_pmu_unregister(&idxd->idxd_pmu->pmu);
kfree(idxd->idxd_pmu);
idxd->idxd_pmu = NULL;
}
-static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct idxd_pmu *idxd_pmu;
-
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
-
- /* select the first online CPU as the designated reader */
- if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
- cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
- idxd_pmu->cpu = cpu;
- }
-
- return 0;
-}
-
-static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct idxd_pmu *idxd_pmu;
- unsigned int target;
-
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
-
- if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
- return 0;
-
- target = cpumask_any_but(cpu_online_mask, cpu);
- /* migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
- perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
- }
-
- return 0;
-}
-
int perfmon_pmu_init(struct idxd_device *idxd)
{
union idxd_perfcap perfcap;
@@ -544,12 +478,6 @@ int perfmon_pmu_init(struct idxd_device
int rc = -ENODEV;
/*
- * perfmon module initialization failed, nothing to do
- */
- if (!cpuhp_set_up)
- return -ENODEV;
-
- /*
* If perfmon_offset or num_counters is 0, it means perfmon is
* not supported on this hardware.
*/
@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device
if (rc)
goto free;
- rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
- if (rc) {
- perf_pmu_unregister(&idxd->idxd_pmu->pmu);
- goto free;
- }
out:
return rc;
free:
@@ -637,22 +560,3 @@ free:
goto out;
}
-
-void __init perfmon_init(void)
-{
- int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
- "driver/dma/idxd/perf:online",
- perf_event_cpu_online,
- perf_event_cpu_offline);
- if (WARN_ON(rc < 0))
- return;
-
- cpuhp_slot = rc;
- cpuhp_set_up = true;
-}
-
-void __exit perfmon_exit(void)
-{
- if (cpuhp_set_up)
- cpuhp_remove_multi_state(cpuhp_slot);
-}

View File

@@ -0,0 +1,69 @@
From 8fd2da09b0c534e05a7b12eb578afae27ad20f7d Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 15 Nov 2024 06:07:59 +0000
Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function
Prepare for the addition of RAPL core energy counter support.
After that, one CPU might be mapped to more than one rapl_pmu
(a package/die one and a core one), so remove the cpu_to_rapl_pmu()
function.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -157,22 +157,15 @@ static struct perf_msr *rapl_msrs;
* RAPL PMU scope.
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
-{
+{ /*
+ * Returns unsigned int, which converts the '-1' return value
+ * (for non-existent mappings in topology map) to UINT_MAX, so
+ * the error check in the caller is simplified.
+ */
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
-{
- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
-
- /*
- * The unsigned check also catches the '-1' return value for non
- * existent mappings in the topology map.
- */
- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
-}
-
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -350,6 +343,7 @@ static int rapl_pmu_event_init(struct pe
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, ret = 0;
struct rapl_pmu *pmu;
+ unsigned int rapl_pmu_idx;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
@@ -376,8 +370,12 @@ static int rapl_pmu_event_init(struct pe
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ return -EINVAL;
+
/* must be done before validate_group */
- pmu = cpu_to_rapl_pmu(event->cpu);
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
if (!pmu)
return -EINVAL;
event->pmu_private = pmu;

View File

@@ -1,84 +0,0 @@
From fb0a3b5932882f02ed42fcaa6db73aba3eafd6d7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:42 -0700
Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug
The rapl pmu just needs to be allocated once. It doesn't matter to be
allocated at each CPU hotplug, or the global init_rapl_pmus().
Move the pmu allocation to the init_rapl_pmus(). So the generic hotplug
supports can be applied.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 44 +++++++++++++++++++++++++++++-------------
1 file changed, 31 insertions(+), 13 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -568,19 +568,8 @@ static int rapl_cpu_online(unsigned int
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
- if (!pmu) {
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
- return -ENOMEM;
-
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
-
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
- }
+ if (!pmu)
+ return -ENOMEM;
/*
* Check if there is an online cpu in the package which collects rapl
@@ -673,6 +662,32 @@ static const struct attribute_group *rap
NULL,
};
+static void __init init_rapl_pmu(void)
+{
+ struct rapl_pmu *pmu;
+ int cpu;
+
+ cpus_read_lock();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ pmu = cpu_to_rapl_pmu(cpu);
+ if (pmu)
+ continue;
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ continue;
+ raw_spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(pmu);
+
+ rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ }
+
+ cpus_read_unlock();
+}
+
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
@@ -693,6 +708,9 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+
+ init_rapl_pmu();
+
return 0;
}

View File

@@ -1,6 +1,6 @@
From 07ec9f38cac6eb6e5b0b062ef99e9458ba567de8 Mon Sep 17 00:00:00 2001
From 30e2cd787aeb9cb9c1148e07446aac76765f715e Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:43 +0000
Date: Fri, 15 Nov 2024 06:08:00 +0000
Subject: perf/x86/rapl: Rename rapl_pmu variables
Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to
@@ -15,13 +15,16 @@ Also rename "pmus" member in rapl_pmus struct, for same reason.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 93 +++++++++++++++++++++---------------------
1 file changed, 47 insertions(+), 46 deletions(-)
arch/x86/events/rapl.c | 91 +++++++++++++++++++++---------------------
1 file changed, 46 insertions(+), 45 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -116,7 +116,7 @@ struct rapl_pmu {
@@ -129,7 +129,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
@@ -30,7 +33,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
};
enum rapl_unit_quirk {
@@ -223,34 +223,34 @@ static void rapl_start_hrtimer(struct ra
@@ -227,34 +227,34 @@ static void rapl_start_hrtimer(struct ra
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
@@ -74,7 +77,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@@ -258,39 +258,39 @@ static void __rapl_pmu_event_start(struc
@@ -262,39 +262,39 @@ static void __rapl_pmu_event_start(struc
event->hw.state = 0;
@@ -128,7 +131,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
list_del(&event->active_entry);
@@ -308,23 +308,23 @@ static void rapl_pmu_event_stop(struct p
@@ -312,23 +312,23 @@ static void rapl_pmu_event_stop(struct p
hwc->state |= PERF_HES_UPTODATE;
}
@@ -157,16 +160,16 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
return 0;
}
@@ -338,7 +338,7 @@ static int rapl_pmu_event_init(struct pe
@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, rapl_pmu_idx, ret = 0;
int bit, ret = 0;
- struct rapl_pmu *pmu;
+ struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
@@ -370,10 +370,11 @@ static int rapl_pmu_event_init(struct pe
@@ -375,10 +375,11 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
/* must be done before validate_group */
@@ -181,7 +184,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
event->hw.idx = bit;
@@ -600,7 +601,7 @@ static void cleanup_rapl_pmus(void)
@@ -605,7 +606,7 @@ static void cleanup_rapl_pmus(void)
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
@@ -190,29 +193,21 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
kfree(rapl_pmus);
}
@@ -615,7 +616,7 @@ static const struct attribute_group *rap
@@ -620,27 +621,27 @@ static const struct attribute_group *rap
static void __init init_rapl_pmu(void)
static int __init init_rapl_pmu(void)
{
- struct rapl_pmu *pmu;
+ struct rapl_pmu *rapl_pmu;
int cpu, rapl_pmu_idx;
int idx;
cpus_read_lock();
@@ -625,19 +626,19 @@ static void __init init_rapl_pmu(void)
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
continue;
- pmu = rapl_pmus->pmus[rapl_pmu_idx];
- if (pmu)
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
+ if (rapl_pmu)
continue;
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
- pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
- if (!pmu)
+ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu));
+ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
+ if (!rapl_pmu)
continue;
goto free;
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
@@ -224,13 +219,20 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
+ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(rapl_pmu);
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
+ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
- rapl_pmus->pmus[idx] = pmu;
+ rapl_pmus->rapl_pmu[idx] = rapl_pmu;
}
cpus_read_unlock();
@@ -653,7 +654,7 @@ static int __init init_rapl_pmus(void)
rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
return 0;
free:
for (; idx > 0; idx--)
- kfree(rapl_pmus->pmus[idx - 1]);
+ kfree(rapl_pmus->rapl_pmu[idx - 1]);
return -ENOMEM;
}
@@ -654,7 +655,7 @@ static int __init init_rapl_pmus(void)
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
}
- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);

View File

@@ -1,9 +1,9 @@
From 68614752b9fd6b6bae6f9ab7b02fc28350c5a541 Mon Sep 17 00:00:00 2001
From 0f84d3bd0af2c300fe13d9f0f5131d0747a13f9e Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:56 +0000
Date: Fri, 15 Nov 2024 06:08:01 +0000
Subject: perf/x86/rapl: Make rapl_model struct global
Preparation for per-core energy counter support addition for AMD CPUs.
Prepare for the addition of RAPL core energy counter support.
As there will always be just one rapl_model variable on a system, make it
global, to make it easier to access it from any function.
@@ -11,21 +11,24 @@ global, to make it easier to access it from any function.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
arch/x86/events/rapl.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -138,6 +138,7 @@ static struct rapl_pmus *rapl_pmus;
@@ -151,6 +151,7 @@ static struct rapl_pmus *rapl_pmus;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+static struct rapl_model *rapl_model;
/*
* RAPL Package energy counter scope:
@@ -536,18 +537,18 @@ static struct perf_msr amd_rapl_msrs[] =
* Helper function to get the correct topology id according to the
@@ -541,18 +542,18 @@ static struct perf_msr amd_rapl_msrs[] =
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
@@ -47,7 +50,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
@@ -797,21 +798,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;

View File

@@ -1,20 +1,26 @@
From b10b887510ccb0b6bc7294888982b862703c9c32 Mon Sep 17 00:00:00 2001
From 395cf3513aaf79c95c611ce97ba451eaf6470c44 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:57 +0000
Subject: perf/x86/rapl: Add arguments to the cleanup and init functions
Date: Fri, 15 Nov 2024 06:08:02 +0000
Subject: perf/x86/rapl: Add arguments to the init and cleanup functions
Prep for per-core RAPL PMU addition.
Prepare for the addition of RAPL core energy counter support.
Add arguments to the init and cleanup functions, which will help in
initialization and cleaning up of two separate PMUs.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 32 +++++++++++++++++++-------------
1 file changed, 19 insertions(+), 13 deletions(-)
arch/x86/events/rapl.c | 28 ++++++++++++++++------------
1 file changed, 16 insertions(+), 12 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -597,7 +597,7 @@ static void __init rapl_advertise(void)
@@ -602,7 +602,7 @@ static void __init rapl_advertise(void)
}
}
@@ -23,35 +29,32 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
{
int i;
@@ -615,7 +615,7 @@ static const struct attribute_group *rap
@@ -620,7 +620,7 @@ static const struct attribute_group *rap
NULL,
};
-static void __init init_rapl_pmu(void)
+static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
-static int __init init_rapl_pmu(void)
+static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *rapl_pmu;
int cpu, rapl_pmu_idx;
@@ -645,20 +645,22 @@ static void __init init_rapl_pmu(void)
cpus_read_unlock();
int idx;
@@ -646,20 +646,20 @@ free:
return -ENOMEM;
}
-static int __init init_rapl_pmus(void)
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope)
{
- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+ int nr_rapl_pmu;
int nr_rapl_pmu = topology_max_packages();
- int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
+ struct rapl_pmus *rapl_pmus;
- if (rapl_pmu_is_pkg_scope()) {
- nr_rapl_pmu = topology_max_packages();
- rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
- if (!rapl_pmu_is_pkg_scope()) {
- nr_rapl_pmu *= topology_max_dies_per_package();
- rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
- }
+ if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG)
+ nr_rapl_pmu = topology_max_packages();
+ else
+ nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
+ nr_rapl_pmu *= topology_max_dies_per_package();
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -62,16 +65,16 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
@@ -673,7 +675,7 @@ static int __init init_rapl_pmus(void)
@@ -674,7 +674,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
- init_rapl_pmu();
+ init_rapl_pmu(rapl_pmus);
return 0;
- return init_rapl_pmu();
+ return init_rapl_pmu(rapl_pmus);
}
@@ -799,8 +801,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static struct rapl_model model_snb = {
@@ -798,8 +798,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
@@ -84,7 +87,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
@@ -816,7 +822,7 @@ static int __init rapl_pmu_init(void)
@@ -815,7 +819,7 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
@@ -93,7 +96,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
if (ret)
return ret;
@@ -829,7 +835,7 @@ static int __init rapl_pmu_init(void)
@@ -828,7 +832,7 @@ static int __init rapl_pmu_init(void)
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
@@ -102,7 +105,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
return ret;
}
module_init(rapl_pmu_init);
@@ -837,6 +843,6 @@ module_init(rapl_pmu_init);
@@ -836,6 +840,6 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
perf_pmu_unregister(&rapl_pmus->pmu);

View File

@@ -1,101 +0,0 @@
From f1525664ff9da3241b3556594dc0b67506ae1ddd Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Tue, 10 Sep 2024 14:25:05 +0530
Subject: perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
After commit ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"),
on AMD processors that support extended CPUID leaf 0x80000026, the
topology_die_cpumask() and topology_logical_die_id() macros no longer
return the package cpumask and package id; instead they return the CCD
(Core Complex Die) mask and id respectively. This changes the energy-pkg
event scope to CCD instead of package.
So, change the PMU scope for AMD and Hygon back to package.
On a 12 CCD 1 Package AMD Zen4 Genoa machine:
Before:
$ cat /sys/devices/power/cpumask
0,8,16,24,32,40,48,56,64,72,80,88.
The expected cpumask here is supposed to be just "0", as it is a package
scope event, only one CPU will be collecting the event for all the CPUs in
the package.
After:
$ cat /sys/devices/power/cpumask
0
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 35 ++++++++++++++++++++++++++++++++---
1 file changed, 32 insertions(+), 3 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -139,9 +139,32 @@ static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
+ * 2.2. Other Intel platforms are single die systems so the scope can be
+ * considered as either pkg-scope or die-scope, and we are considering
+ * them as die-scope.
+ */
+#define rapl_pmu_is_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
+/*
+ * Helper function to get the correct topology id according to the
+ * RAPL PMU scope.
+ */
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+ topology_logical_die_id(cpu);
+}
+
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
@@ -617,7 +640,7 @@ static void __init init_rapl_pmu(void)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
}
cpus_read_unlock();
@@ -626,6 +649,12 @@ static void __init init_rapl_pmu(void)
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+
+ if (rapl_pmu_is_pkg_scope()) {
+ nr_rapl_pmu = topology_max_packages();
+ rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
+ }
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -641,8 +670,8 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
rapl_pmus->pmu.module = THIS_MODULE;
- rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
init_rapl_pmu();

View File

@@ -1,20 +1,22 @@
From b5c83c40540298a39f8314034b705f1236b17a9f Mon Sep 17 00:00:00 2001
From 98d0cb818ba4695b9a41ab83b7c00ec1cbdf1b35 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:58 +0000
Date: Fri, 15 Nov 2024 06:08:03 +0000
Subject: perf/x86/rapl: Modify the generic variable names to *_pkg*
Prep for addition of power_per_core PMU to handle core scope energy
consumption for AMD CPUs.
Prepare for the addition of RAPL core energy counter support.
Replace the generic names with *_pkg*, to differentiate between the
scopes of the two different PMUs and their variables.
Replace the generic names with *_pkg*, to later on differentiate between
the scopes of the two different PMUs and their variables.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 118 ++++++++++++++++++++---------------------
1 file changed, 59 insertions(+), 59 deletions(-)
arch/x86/events/rapl.c | 120 ++++++++++++++++++++---------------------
1 file changed, 60 insertions(+), 60 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -41,7 +43,16 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
"pp0-core",
"package",
"dram",
@@ -126,16 +126,16 @@ enum rapl_unit_quirk {
@@ -112,7 +112,7 @@ static struct perf_pmu_events_attr event
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
-#define rapl_pmu_is_pkg_scope() \
+#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@@ -139,16 +139,16 @@ enum rapl_unit_quirk {
};
struct rapl_model {
@@ -63,25 +74,18 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
@@ -149,7 +149,7 @@ static struct rapl_model *rapl_model;
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
-#define rapl_pmu_is_pkg_scope() \
+#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@@ -159,7 +159,7 @@ static struct rapl_model *rapl_model;
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
{
@@ -163,8 +163,8 @@ static inline unsigned int get_rapl_pmu_
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
- topology_logical_die_id(cpu);
+ return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
+ topology_logical_die_id(cpu);
}
@@ -172,7 +172,7 @@ static inline u64 rapl_read_counter(stru
static inline u64 rapl_read_counter(struct perf_event *event)
@@ -176,7 +176,7 @@ static inline u64 rapl_read_counter(stru
static inline u64 rapl_scale(u64 v, int cfg)
{
@@ -90,7 +94,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
@@ -182,7 +182,7 @@ static inline u64 rapl_scale(u64 v, int
@@ -186,7 +186,7 @@ static inline u64 rapl_scale(u64 v, int
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
@@ -99,8 +103,8 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
}
static u64 rapl_event_update(struct perf_event *event)
@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe
struct rapl_pmu *rapl_pmu;
@@ -347,7 +347,7 @@ static int rapl_pmu_event_init(struct pe
unsigned int rapl_pmu_idx;
/* only look at RAPL events */
- if (event->attr.type != rapl_pmus->pmu.type)
@@ -108,7 +112,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
return -ENOENT;
/* check only supported bits are set */
@@ -352,14 +352,14 @@ static int rapl_pmu_event_init(struct pe
@@ -357,14 +357,14 @@ static int rapl_pmu_event_init(struct pe
if (event->cpu < 0)
return -EINVAL;
@@ -126,7 +130,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
return -EINVAL;
/* unsupported modes and filters */
@@ -367,11 +367,11 @@ static int rapl_pmu_event_init(struct pe
@@ -372,11 +372,11 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
@@ -140,7 +144,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
if (!rapl_pmu)
return -EINVAL;
@@ -525,11 +525,11 @@ static struct perf_msr intel_rapl_spr_ms
@@ -530,11 +530,11 @@ static struct perf_msr intel_rapl_spr_ms
};
/*
@@ -155,7 +159,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@@ -545,8 +545,8 @@ static int rapl_check_hw_unit(void)
@@ -550,8 +550,8 @@ static int rapl_check_hw_unit(void)
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
@@ -166,7 +170,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
switch (rapl_model->unit_quirk) {
/*
@@ -556,11 +556,11 @@ static int rapl_check_hw_unit(void)
@@ -561,11 +561,11 @@ static int rapl_check_hw_unit(void)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
case RAPL_UNIT_QUIRK_INTEL_HSW:
@@ -180,7 +184,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
break;
default:
break;
@@ -575,9 +575,9 @@ static int rapl_check_hw_unit(void)
@@ -580,9 +580,9 @@ static int rapl_check_hw_unit(void)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
@@ -192,7 +196,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
}
return 0;
}
@@ -587,12 +587,12 @@ static void __init rapl_advertise(void)
@@ -592,12 +592,12 @@ static void __init rapl_advertise(void)
int i;
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
@@ -209,7 +213,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
}
}
}
@@ -681,71 +681,71 @@ static int __init init_rapl_pmus(struct
@@ -678,71 +678,71 @@ static int __init init_rapl_pmus(struct
}
static struct rapl_model model_snb = {
@@ -297,7 +301,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -801,11 +801,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
@@ -798,11 +798,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
@@ -312,7 +316,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
id = x86_match_cpu(rapl_model_match);
if (!id)
@@ -813,20 +813,20 @@ static int __init rapl_pmu_init(void)
@@ -810,20 +810,20 @@ static int __init rapl_pmu_init(void)
rapl_model = (struct rapl_model *) id->driver_data;
@@ -338,7 +342,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
if (ret)
goto out;
@@ -835,14 +835,14 @@ static int __init rapl_pmu_init(void)
@@ -832,14 +832,14 @@ static int __init rapl_pmu_init(void)
out:
pr_warn("Initialization failed (%d), disabled\n", ret);

View File

@@ -1,87 +0,0 @@
From b8e1231d5f78314de8f9066baba7b1fdd5e59218 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:42 +0000
Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function
Preparation for the addition of per-core RAPL energy counter support for
AMD CPUs. After that change, one CPU might be mapped to more than one
rapl_pmu (the package/die one or the per-core one), so it also makes sense
to use the get_rapl_pmu_idx() helper, which is already used to index into
the rapl_pmus->pmus[] array.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
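Both call sites now rely on the unsigned bound check mentioned in the
removed helper's comment: an index of -1 (a CPU with no mapping in the
topology map) wraps to UINT_MAX and fails the "idx < nr_rapl_pmu" test. A
tiny standalone illustration of that idiom (not the kernel code):

#include <stdio.h>

static int slot_is_valid(unsigned int idx, unsigned int nr)
{
	/* a single comparison rejects both out-of-range and -1 indices */
	return idx < nr;
}

int main(void)
{
	printf("idx  3 of 8 -> %d\n", slot_is_valid(3, 8));                 /* 1 */
	printf("idx  9 of 8 -> %d\n", slot_is_valid(9, 8));                 /* 0 */
	printf("idx -1 of 8 -> %d\n", slot_is_valid((unsigned int)-1, 8));  /* 0 */
	return 0;
}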
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -162,17 +162,6 @@ static inline unsigned int get_rapl_pmu_
topology_logical_die_id(cpu);
}
-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
-{
- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
-
- /*
- * The unsigned check also catches the '-1' return value for non
- * existent mappings in the topology map.
- */
- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
-}
-
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -348,7 +337,7 @@ static void rapl_pmu_event_del(struct pe
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
- int bit, ret = 0;
+ int bit, rapl_pmu_idx, ret = 0;
struct rapl_pmu *pmu;
/* only look at RAPL events */
@@ -376,8 +365,12 @@ static int rapl_pmu_event_init(struct pe
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ return -EINVAL;
+
/* must be done before validate_group */
- pmu = cpu_to_rapl_pmu(event->cpu);
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
if (!pmu)
return -EINVAL;
event->pmu_private = pmu;
@@ -623,12 +616,16 @@ static const struct attribute_group *rap
static void __init init_rapl_pmu(void)
{
struct rapl_pmu *pmu;
- int cpu;
+ int cpu, rapl_pmu_idx;
cpus_read_lock();
for_each_cpu(cpu, cpu_online_mask) {
- pmu = cpu_to_rapl_pmu(cpu);
+ rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ continue;
+
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
if (pmu)
continue;
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
@@ -640,7 +637,7 @@ static void __init init_rapl_pmu(void)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
+ rapl_pmus->pmus[rapl_pmu_idx] = pmu;
}
cpus_read_unlock();

View File

@@ -1,23 +1,24 @@
From dbc0343069c8f86fad0d8d9075f70f79114ef10a Mon Sep 17 00:00:00 2001
From ea8de8012d6d6ef2c24c45a56f230e3a7fcb8ce7 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:59 +0000
Date: Fri, 15 Nov 2024 06:08:04 +0000
Subject: perf/x86/rapl: Remove the global variable rapl_msrs
Prepare for the addition of RAPL core energy counter support.
After making the rapl_model struct global, the rapl_msrs global
variable isn't needed, so remove it.
It will also be cleaner when the new per-core scope PMU is added, as we
will need to maintain two rapl_msrs arrays (one for the per-core scope PMU
and one for the package scope PMU) inside the rapl_model struct.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -137,7 +137,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_
@@ -150,7 +150,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_
static struct rapl_pmus *rapl_pmus_pkg;
static unsigned int rapl_pkg_cntr_mask;
static u64 rapl_timer_ms;
@@ -25,7 +26,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static struct rapl_model *rapl_model;
/*
@@ -376,7 +375,7 @@ static int rapl_pmu_event_init(struct pe
@@ -381,7 +380,7 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
event->pmu_private = rapl_pmu;
@@ -34,7 +35,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
event->hw.config = cfg;
event->hw.idx = bit;
@@ -813,9 +812,7 @@ static int __init rapl_pmu_init(void)
@@ -810,9 +809,7 @@ static int __init rapl_pmu_init(void)
rapl_model = (struct rapl_model *) id->driver_data;

View File

@@ -1,24 +1,26 @@
From d6a5a28382558b896767a78db795d421015831a7 Mon Sep 17 00:00:00 2001
From 8cbd0e18e5b36bbfdb5ca8931a7668c7963be8e5 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:48:00 +0000
Date: Fri, 15 Nov 2024 06:08:05 +0000
Subject: perf/x86/rapl: Move the cntr_mask to rapl_pmus struct
Preparation for the addition of per-core RAPL energy counter for AMD
CPUs.
Prepare for the addition of RAPL core energy counter support.
Moving cntr_mask to rapl_pmus struct instead of adding a new global
cntr_mask for the per-core RAPL energy counter, will ensure that the
"per_core_cntr_mask" is only created if needed (i.e. in case of AMD
CPUs).
Move cntr_mask to rapl_pmus struct instead of adding a new global
cntr_mask for the new RAPL power_core PMU. This will also ensure that
the second "core_cntr_mask" is only created if needed (i.e. in case of
AMD CPUs).
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Zhang Rui <rui.zhang@intel.com>
---
arch/x86/events/rapl.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -116,6 +116,7 @@ struct rapl_pmu {
@@ -129,6 +129,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
@@ -26,7 +28,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
@@ -135,7 +136,6 @@ struct rapl_model {
@@ -148,7 +149,6 @@ struct rapl_model {
/* 1/2^hw_unit Joule */
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus_pkg;
@@ -34,7 +36,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static u64 rapl_timer_ms;
static struct rapl_model *rapl_model;
@@ -358,7 +358,7 @@ static int rapl_pmu_event_init(struct pe
@@ -363,7 +363,7 @@ static int rapl_pmu_event_init(struct pe
bit = cfg - 1;
/* check event supported */
@@ -43,7 +45,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
return -EINVAL;
/* unsupported modes and filters */
@@ -586,10 +586,10 @@ static void __init rapl_advertise(void)
@@ -591,10 +591,10 @@ static void __init rapl_advertise(void)
int i;
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
@@ -56,7 +58,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
pr_info("hw unit of domain %s 2^-%d Joules\n",
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
@@ -812,9 +812,6 @@ static int __init rapl_pmu_init(void)
@@ -809,9 +809,6 @@ static int __init rapl_pmu_init(void)
rapl_model = (struct rapl_model *) id->driver_data;
@@ -66,7 +68,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
ret = rapl_check_hw_unit();
if (ret)
return ret;
@@ -823,6 +820,10 @@ static int __init rapl_pmu_init(void)
@@ -820,6 +817,10 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;

View File

@@ -1,39 +1,43 @@
From 3cb480ec2950f4c6351c602552fc4f9a8e524b89 Mon Sep 17 00:00:00 2001
From 27bdf22a4c7815831d38acd1cb08e5aa6ce95ea0 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:48:01 +0000
Subject: perf/x86/rapl: Add per-core energy counter support for AMD CPUs
Date: Fri, 15 Nov 2024 06:08:06 +0000
Subject: perf/x86/rapl: Add core energy counter support for AMD CPUs
Add a new "power_per_core" PMU and "energy-per-core" event for
monitoring energy consumption by each core. The existing energy-cores
event aggregates the energy consumption at the package level.
This new event aligns with AMD's per_core energy counters.
Add a new "power_core" PMU and "energy-core" event for monitoring
energy consumption by each individual core. The existing energy-cores
event aggregates the energy consumption of CPU cores at the package level.
This new event aligns with AMD's per-core energy counters.
Tested the package level and core level PMU counters with workloads
pinned to different CPUs.
Results with workload pinned to CPU 1 in core 1 on a AMD Zen4 Genoa
Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa
machine:
$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1
$ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s
stress-ng: info: [21250] setting to a 5 second run per stressor
stress-ng: info: [21250] dispatching hogs: 1 matrix
stress-ng: info: [21250] successful run completed in 5.00s
Performance counter stats for 'system wide':
S0-D0-C0 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C1 1 5.72 Joules power_per_core/energy-per-core/
S0-D0-C2 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C3 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C4 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C5 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C6 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C7 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C8 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C9 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C10 1 0.02 Joules power_per_core/energy-per-core/
S0-D0-C0 1 0.00 Joules power_core/energy-core/
S0-D0-C1 1 0.00 Joules power_core/energy-core/
S0-D0-C2 1 0.00 Joules power_core/energy-core/
S0-D0-C3 1 0.00 Joules power_core/energy-core/
S0-D0-C4 1 8.43 Joules power_core/energy-core/
S0-D0-C5 1 0.00 Joules power_core/energy-core/
S0-D0-C6 1 0.00 Joules power_core/energy-core/
S0-D0-C7 1 0.00 Joules power_core/energy-core/
S0-D1-C8 1 0.00 Joules power_core/energy-core/
S0-D1-C9 1 0.00 Joules power_core/energy-core/
S0-D1-C10 1 0.00 Joules power_core/energy-core/
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
arch/x86/events/rapl.c | 178 +++++++++++++++++++++++++++++++++--------
1 file changed, 143 insertions(+), 35 deletions(-)
arch/x86/events/rapl.c | 185 +++++++++++++++++++++++++++++++++--------
1 file changed, 152 insertions(+), 33 deletions(-)
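The energy-core.scale attribute added below is the usual RAPL fixed scale
of 2^-32 Joules per count (the driver computes in 2^-32 J increments
regardless of the MSR unit). A tiny standalone check of that equivalence
and of the 8.43 J reading above (compile with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double scale = ldexp(1.0, -32);	/* 2^-32 Joules per count */

	printf("scale = %.25e J\n", scale);	/* 2.3283064365386962890625e-10 */
	printf("8.43 J is about %.0f counts\n", 8.43 / scale);
	return 0;
}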
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -41,8 +45,8 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
* event: rapl_energy_psys
* perf code: 0x5
*
+ * per_core counter: consumption of a single physical core
+ * event: rapl_energy_per_core (power_per_core PMU)
+ * core counter: consumption of a single physical core
+ * event: rapl_energy_core (power_core PMU)
+ * perf code: 0x1
+ *
* We manage those counters as free running (read-only). They may be
@@ -52,7 +56,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};
+#define PERF_RAPL_PER_CORE 0 /* per-core */
+#define PERF_RAPL_CORE 0 /* single core */
+#define PERF_RAPL_CORE_EVENTS_MAX 1
+#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
+
@@ -63,12 +67,12 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
"psys",
};
+static const char *const rapl_core_domain_name __initconst = "per-core";
+static const char *const rapl_core_domain_name __initconst = "core";
+
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@@ -128,14 +138,18 @@ enum rapl_unit_quirk {
@@ -141,14 +151,18 @@ enum rapl_unit_quirk {
struct rapl_model {
struct perf_msr *rapl_pkg_msrs;
@@ -87,25 +91,35 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static u64 rapl_timer_ms;
static struct rapl_model *rapl_model;
@@ -156,10 +170,14 @@ static struct rapl_model *rapl_model;
@@ -156,14 +170,23 @@ static struct rapl_model *rapl_model;
* Helper function to get the correct topology id according to the
* RAPL PMU scope.
*/
-static inline unsigned int get_rapl_pmu_idx(int cpu)
-{ /*
+static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
{
+{
+ /*
* Returns unsigned int, which converts the '-1' return value
* (for non-existent mappings in topology map) to UINT_MAX, so
* the error check in the caller is simplified.
*/
- return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
- topology_logical_die_id(cpu);
+ if (scope == PERF_PMU_SCOPE_PKG)
- topology_logical_die_id(cpu);
+ switch (scope) {
+ case PERF_PMU_SCOPE_PKG:
+ return topology_logical_package_id(cpu);
+ else if (scope == PERF_PMU_SCOPE_DIE)
+ case PERF_PMU_SCOPE_DIE:
+ return topology_logical_die_id(cpu);
+ else
+ case PERF_PMU_SCOPE_CORE:
+ return topology_logical_core_id(cpu);
+ default:
+ return -EINVAL;
+ }
}
static inline u64 rapl_read_counter(struct perf_event *event)
@@ -169,19 +187,20 @@ static inline u64 rapl_read_counter(stru
@@ -173,19 +196,20 @@ static inline u64 rapl_read_counter(stru
return raw;
}
@@ -132,7 +146,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
}
static u64 rapl_event_update(struct perf_event *event)
@@ -208,7 +227,7 @@ static u64 rapl_event_update(struct perf
@@ -212,7 +236,7 @@ static u64 rapl_event_update(struct perf
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
@@ -141,13 +155,14 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
local64_add(sdelta, &event->count);
@@ -337,12 +356,13 @@ static void rapl_pmu_event_del(struct pe
@@ -341,13 +365,14 @@ static void rapl_pmu_event_del(struct pe
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
- int bit, rapl_pmu_idx, ret = 0;
+ int bit, rapl_pmus_scope, rapl_pmu_idx, ret = 0;
- int bit, ret = 0;
+ int bit, rapl_pmus_scope, ret = 0;
struct rapl_pmu *rapl_pmu;
unsigned int rapl_pmu_idx;
+ struct rapl_pmus *rapl_pmus;
- /* only look at RAPL events */
@@ -159,7 +174,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
/* check only supported bits are set */
if (event->attr.config & ~RAPL_EVENT_MASK)
@@ -351,31 +371,49 @@ static int rapl_pmu_event_init(struct pe
@@ -356,31 +381,49 @@ static int rapl_pmu_event_init(struct pe
if (event->cpu < 0)
return -EINVAL;
@@ -186,7 +201,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
+ bit = cfg - 1;
+ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
+ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
+ /* only look at RAPL per-core events */
+ /* only look at RAPL core events */
+ if (event->attr.type != rapl_pmus_core->pmu.type)
+ return -ENOENT;
+
@@ -222,34 +237,34 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
event->hw.config = cfg;
event->hw.idx = bit;
@@ -392,12 +430,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl
@@ -397,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@@ -407,6 +447,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale,
@@ -412,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale,
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
/*
* There are no default events, but we need to create
@@ -439,6 +480,12 @@ static const struct attribute_group *rap
@@ -444,6 +490,12 @@ static const struct attribute_group *rap
NULL,
};
+static const struct attribute_group *rapl_per_core_attr_groups[] = {
+static const struct attribute_group *rapl_core_attr_groups[] = {
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
@@ -258,38 +273,38 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
static struct attribute *rapl_events_cores[] = {
EVENT_PTR(rapl_cores),
EVENT_PTR(rapl_cores_unit),
@@ -499,6 +546,18 @@ static struct attribute_group rapl_event
@@ -504,6 +556,18 @@ static struct attribute_group rapl_event
.attrs = rapl_events_psys,
};
+static struct attribute *rapl_events_per_core[] = {
+ EVENT_PTR(rapl_per_core),
+ EVENT_PTR(rapl_per_core_unit),
+ EVENT_PTR(rapl_per_core_scale),
+static struct attribute *rapl_events_core[] = {
+ EVENT_PTR(rapl_core),
+ EVENT_PTR(rapl_core_unit),
+ EVENT_PTR(rapl_core_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_events_per_core_group = {
+static struct attribute_group rapl_events_core_group = {
+ .name = "events",
+ .attrs = rapl_events_per_core,
+ .attrs = rapl_events_core,
+};
+
static bool test_msr(int idx, void *data)
{
return test_bit(idx, (unsigned long *) data);
@@ -536,6 +595,11 @@ static struct perf_msr amd_rapl_pkg_msrs
@@ -541,6 +605,11 @@ static struct perf_msr amd_rapl_pkg_msrs
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
+static struct perf_msr amd_rapl_core_msrs[] = {
+ [PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group,
+ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
+ test_msr, false, RAPL_MSR_MASK },
+};
+
static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
@@ -547,6 +611,8 @@ static int rapl_check_hw_unit(void)
@@ -552,6 +621,8 @@ static int rapl_check_hw_unit(void)
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
@@ -298,7 +313,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
@@ -565,7 +631,6 @@ static int rapl_check_hw_unit(void)
@@ -570,7 +641,6 @@ static int rapl_check_hw_unit(void)
break;
}
@@ -306,7 +321,7 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
/*
* Calculate the timer rate:
* Use reference of 200W for scaling the timeout to avoid counter
@@ -584,9 +649,13 @@ static int rapl_check_hw_unit(void)
@@ -589,9 +659,13 @@ static int rapl_check_hw_unit(void)
static void __init rapl_advertise(void)
{
int i;
@@ -321,42 +336,30 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
@@ -594,6 +663,10 @@ static void __init rapl_advertise(void)
@@ -599,6 +673,10 @@ static void __init rapl_advertise(void)
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
}
+
+ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_PER_CORE)))
+ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_core_domain_name, rapl_core_hw_unit);
}
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
@@ -614,6 +687,10 @@ static const struct attribute_group *rap
@@ -619,6 +697,10 @@ static const struct attribute_group *rap
NULL,
};
+static const struct attribute_group *rapl_per_core_attr_update[] = {
+ &rapl_events_per_core_group,
+static const struct attribute_group *rapl_core_attr_update[] = {
+ &rapl_events_core_group,
+};
+
static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *rapl_pmu;
@@ -622,10 +699,9 @@ static void __init init_rapl_pmu(struct
cpus_read_lock();
for_each_cpu(cpu, cpu_online_mask) {
- rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+ rapl_pmu_idx = get_rapl_pmu_idx(cpu, rapl_pmus->pmu.scope);
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
continue;
-
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
if (rapl_pmu)
continue;
@@ -644,15 +720,19 @@ static void __init init_rapl_pmu(struct
cpus_read_unlock();
@@ -645,13 +727,22 @@ free:
return -ENOMEM;
}
-static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope)
@@ -364,31 +367,33 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
+ const struct attribute_group **rapl_attr_groups,
+ const struct attribute_group **rapl_attr_update)
{
int nr_rapl_pmu;
int nr_rapl_pmu = topology_max_packages();
struct rapl_pmus *rapl_pmus;
if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG)
nr_rapl_pmu = topology_max_packages();
- else
+ else if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ else
+ nr_rapl_pmu = topology_max_packages() * topology_num_cores_per_package();
+ /*
+ * rapl_pmu_scope must be either PKG, DIE or CORE
+ */
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
nr_rapl_pmu *= topology_max_dies_per_package();
+ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
+ nr_rapl_pmu *= topology_num_cores_per_package();
+ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
+ return -EINVAL;
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -743,8 +823,10 @@ static struct rapl_model model_spr = {
@@ -740,8 +831,10 @@ static struct rapl_model model_spr = {
static struct rapl_model model_amd_hygon = {
.pkg_events = BIT(PERF_RAPL_PKG),
+ .core_events = BIT(PERF_RAPL_PER_CORE),
+ .core_events = BIT(PERF_RAPL_CORE),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
+ .rapl_core_msrs = amd_rapl_core_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -816,7 +898,8 @@ static int __init rapl_pmu_init(void)
@@ -813,7 +906,8 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
@@ -398,35 +403,35 @@ Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
if (ret)
return ret;
@@ -828,6 +911,27 @@ static int __init rapl_pmu_init(void)
@@ -825,6 +919,27 @@ static int __init rapl_pmu_init(void)
if (ret)
goto out;
+ if (rapl_model->core_events) {
+ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
+ rapl_per_core_attr_groups,
+ rapl_per_core_attr_update);
+ rapl_core_attr_groups,
+ rapl_core_attr_update);
+ if (ret) {
+ pr_warn("Per-core PMU initialization failed (%d)\n", ret);
+ goto per_core_init_failed;
+ pr_warn("power-core PMU initialization failed (%d)\n", ret);
+ goto core_init_failed;
+ }
+
+ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
+ PERF_RAPL_CORE_EVENTS_MAX, false,
+ (void *) &rapl_model->core_events);
+
+ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
+ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
+ if (ret) {
+ pr_warn("Per-core PMU registration failed (%d)\n", ret);
+ pr_warn("power-core PMU registration failed (%d)\n", ret);
+ cleanup_rapl_pmus(rapl_pmus_core);
+ }
+ }
+
+per_core_init_failed:
+core_init_failed:
rapl_advertise();
return 0;
@@ -840,6 +944,10 @@ module_init(rapl_pmu_init);
@@ -837,6 +952,10 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{

View File

@@ -0,0 +1,21 @@
From c66779234fa7ba71cefa1c4eb283db53a3cb5303 Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Sat, 23 Nov 2024 12:45:28 +0100
Subject: amd-rapl-6.12: fix clang-built kernel not booting
Link: https://lore.kernel.org/lkml/7eaf557d-7e85-4fd3-abee-f84ac01d92c1@amd.com/
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
arch/x86/events/rapl.c | 1 +
1 file changed, 1 insertion(+)
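Attribute group arrays like rapl_core_attr_update are walked until a NULL
sentinel, so the missing terminator let the walk run past the end of the
array. A minimal standalone illustration of the convention (not the kernel
code):

#include <stdio.h>

static const char *groups_ok[] = { "events", NULL };	/* NULL-terminated */

static void walk(const char *const *g)
{
	/* consumers stop at the NULL; without it this loop reads whatever
	 * happens to follow the array in memory */
	for (; *g; g++)
		printf("group: %s\n", *g);
}

int main(void)
{
	walk(groups_ok);
	return 0;
}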
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -699,6 +699,7 @@ static const struct attribute_group *rap
static const struct attribute_group *rapl_core_attr_update[] = {
&rapl_events_core_group,
+ NULL,
};
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)

View File

@@ -1,26 +1,35 @@
From d31e903a364802c068ff23bdd448cc70eda71a7c Mon Sep 17 00:00:00 2001
From 3d722d5259babc1650ab6cb1a8bbf27863af75f2 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:38 +0100
Subject: cpuidle: menu: Remove iowait influence
Remove CPU iowaiters influence on idle state selection.
Remove the menu notion of performance multiplier which increased with
the number of tasks that went to iowait sleep on this CPU and haven't
woken up yet.
Relying on iowait for cpuidle is problematic for a few reasons:
1. There is no guarantee that an iowaiting task will wake up on the
same CPU.
2. The task being in iowait says nothing about the idle duration, we
could be selecting shallower states for a long time.
3. The task being in iowait doesn't always imply a performance hit
with increased latency.
4. If there is such a performance hit, the number of iowaiting tasks
doesn't directly correlate.
5. The definition of iowait altogether is vague at best, it is
sprinkled across kernel code.
1. There is no guarantee that an iowaiting task will wake up on the
same CPU.
2. The task being in iowait says nothing about the idle duration, we
could be selecting shallower states for a long time.
3. The task being in iowait doesn't always imply a performance hit
with increased latency.
4. If there is such a performance hit, the number of iowaiting tasks
doesn't directly correlate.
5. The definition of iowait altogether is vague at best, it is
sprinkled across kernel code.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/20240905092645.2885200-2-christian.loehle@arm.com
[ rjw: Minor edits in the changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
drivers/cpuidle/governors/menu.c | 76 ++++----------------------------
1 file changed, 9 insertions(+), 67 deletions(-)

View File

@@ -1,4 +1,4 @@
From 3f840a42780323a4437dd1a417488d141c33af15 Mon Sep 17 00:00:00 2001
From 4c13cc86a7d9f1e88e8090a94c792eb45d7ef1ef Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:39 +0100
Subject: cpuidle: Prefer teo over menu governor

View File

@@ -1,39 +0,0 @@
From ca8c9368b6f28ef625716b03aa930acfb8afe158 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:40 +0100
Subject: TEST: cpufreq/schedutil: Linear iowait boost step
In preparation for capping iowait boost, make the steps linear as
opposed to doubling.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
kernel/sched/cpufreq_schedutil.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
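For scale: IOWAIT_BOOST_MIN is SCHED_CAPACITY_SCALE / 8 = 128, so the
doubling ramp saturates in four requests (128, 256, 512, 1024), while the
linear variant below climbs in 128-unit steps (128, 256, 384, ..., 1024)
and takes eight requests to reach full capacity.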
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -267,7 +267,8 @@ static void sugov_iowait_boost(struct su
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
sg_cpu->iowait_boost =
- min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
+ min_t(unsigned int,
+ sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE);
return;
}
@@ -308,11 +309,9 @@ static unsigned long sugov_iowait_apply(
/*
* No boost pending; reduce the boost value.
*/
- sg_cpu->iowait_boost >>= 1;
- if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
- sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN;
+ if (!sg_cpu->iowait_boost)
return 0;
- }
}
sg_cpu->iowait_boost_pending = false;

View File

@@ -1,106 +0,0 @@
From 33f05bd16a4ac2f6f36c9eb88016e2375dcb597c Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:41 +0100
Subject: TEST: cpufreq/schedutil: iowait boost cap sysfs
Add a knob to cap applied iowait_boost per sysfs.
This is to test for potential regressions.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
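As set up below, the cap defaults to SCHED_CAPACITY_SCALE (1024), i.e. no
effective capping until it is lowered; writing, say, 256 to the new
iowait_boost_cap attribute limits the boost contribution to a quarter of
full capacity.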
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,6 +11,7 @@
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
+ unsigned int iowait_boost_cap;
};
struct sugov_policy {
@@ -35,6 +36,8 @@ struct sugov_policy {
bool limits_changed;
bool need_freq_update;
+
+ unsigned int iowait_boost_cap;
};
struct sugov_cpu {
@@ -316,6 +319,9 @@ static unsigned long sugov_iowait_apply(
sg_cpu->iowait_boost_pending = false;
+ if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap)
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap;
+
/*
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
@@ -554,6 +560,14 @@ static ssize_t rate_limit_us_show(struct
return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
+
+static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->iowait_boost_cap);
+}
+
static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
@@ -572,10 +586,30 @@ rate_limit_us_store(struct gov_attr_set
return count;
}
+static ssize_t
+iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int iowait_boost_cap;
+
+ if (kstrtouint(buf, 10, &iowait_boost_cap))
+ return -EINVAL;
+
+ tunables->iowait_boost_cap = iowait_boost_cap;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ sg_policy->iowait_boost_cap = iowait_boost_cap;
+
+ return count;
+}
+
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap);
static struct attribute *sugov_attrs[] = {
&rate_limit_us.attr,
+ &iowait_boost_cap.attr,
NULL
};
ATTRIBUTE_GROUPS(sugov);
@@ -765,6 +799,8 @@ static int sugov_init(struct cpufreq_pol
tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+ tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE;
+
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -833,6 +869,8 @@ static int sugov_start(struct cpufreq_po
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
+ sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE;
+
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
if (policy_is_shared(policy))

View File

@@ -1,325 +0,0 @@
From 33eb6c08d7c615fad308001921c7b1148cbccfde Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:42 +0100
Subject: cpufreq/schedutil: Remove iowait boost
iowait boost in schedutil was introduced by
commit 21ca6d2c52f8 ("cpufreq: schedutil: Add iowait boosting"),
with it more or less following intel_pstate's approach of increasing
frequency after an iowait wakeup.
Behaviour that is piggy-backed onto iowait boost is problematic
for a number of reasons, so remove it.
For schedutil specifically these are some of the reasons:
1. Boosting is applied even in scenarios where it doesn't improve
throughput.
2. The boost is not accounted for in EAS: a) feec() will only consider
the actual task utilization for task placement, but another CPU might
be more energy-efficient at that capacity than the boosted one.
b) When placing a non-IO task while a CPU is boosted compute_energy()
assumes a lower OPP than what is actually applied. This leads to
wrong EAS decisions.
3. Actual IO heavy workloads are hardly distinguished from infrequent
in_iowait wakeups.
4. The boost isn't accounted for in task placement.
5. The boost isn't associated with a task, it therefore lingers on the
rq even after the responsible task has migrated / stopped.
6. The boost isn't associated with a task, it therefore needs to ramp
up again when migrated.
7. Since schedutil doesn't know which task is getting woken up,
multiple unrelated in_iowait tasks lead to boosting.
8. Boosting is hard to control with UCLAMP_MAX (which is only active
when the task is on the rq, which for boosted tasks is usually not
the case for most of the time).
One benefit of schedutil specifically is the reliance on the
scheduler's utilization signals, which have evolved a lot since its
original introduction. Some cases that benefitted from iowait boosting
in the past can now be covered by e.g. util_est.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
kernel/sched/cpufreq_schedutil.c | 181 +------------------------------
1 file changed, 3 insertions(+), 178 deletions(-)
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,12 +6,9 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
-
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
- unsigned int iowait_boost_cap;
};
struct sugov_policy {
@@ -36,8 +33,6 @@ struct sugov_policy {
bool limits_changed;
bool need_freq_update;
-
- unsigned int iowait_boost_cap;
};
struct sugov_cpu {
@@ -45,10 +40,6 @@ struct sugov_cpu {
struct sugov_policy *sg_policy;
unsigned int cpu;
- bool iowait_boost_pending;
- unsigned int iowait_boost;
- u64 last_update;
-
unsigned long util;
unsigned long bw_min;
@@ -198,137 +189,15 @@ unsigned long sugov_effective_cpu_perf(i
return max(min, max);
}
-static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
- util = max(util, boost);
sg_cpu->bw_min = min;
sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
-/**
- * sugov_iowait_reset() - Reset the IO boost status of a CPU.
- * @sg_cpu: the sugov data for the CPU to boost
- * @time: the update time from the caller
- * @set_iowait_boost: true if an IO boost has been requested
- *
- * The IO wait boost of a task is disabled after a tick since the last update
- * of a CPU. If a new IO wait boost is requested after more then a tick, then
- * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
- * efficiency by ignoring sporadic wakeups from IO.
- */
-static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
- bool set_iowait_boost)
-{
- s64 delta_ns = time - sg_cpu->last_update;
-
- /* Reset boost only if a tick has elapsed since last request */
- if (delta_ns <= TICK_NSEC)
- return false;
-
- sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
- sg_cpu->iowait_boost_pending = set_iowait_boost;
-
- return true;
-}
-
-/**
- * sugov_iowait_boost() - Updates the IO boost status of a CPU.
- * @sg_cpu: the sugov data for the CPU to boost
- * @time: the update time from the caller
- * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
- *
- * Each time a task wakes up after an IO operation, the CPU utilization can be
- * boosted to a certain utilization which doubles at each "frequent and
- * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
- * of the maximum OPP.
- *
- * To keep doubling, an IO boost has to be requested at least once per tick,
- * otherwise we restart from the utilization of the minimum OPP.
- */
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- unsigned int flags)
-{
- bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
-
- /* Reset boost if the CPU appears to have been idle enough */
- if (sg_cpu->iowait_boost &&
- sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
- return;
-
- /* Boost only tasks waking up after IO */
- if (!set_iowait_boost)
- return;
-
- /* Ensure boost doubles only one time at each request */
- if (sg_cpu->iowait_boost_pending)
- return;
- sg_cpu->iowait_boost_pending = true;
-
- /* Double the boost at each request */
- if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost =
- min_t(unsigned int,
- sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE);
- return;
- }
-
- /* First wakeup after IO: start with minimum boost */
- sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
-}
-
-/**
- * sugov_iowait_apply() - Apply the IO boost to a CPU.
- * @sg_cpu: the sugov data for the cpu to boost
- * @time: the update time from the caller
- * @max_cap: the max CPU capacity
- *
- * A CPU running a task which woken up after an IO operation can have its
- * utilization boosted to speed up the completion of those IO operations.
- * The IO boost value is increased each time a task wakes up from IO, in
- * sugov_iowait_apply(), and it's instead decreased by this function,
- * each time an increase has not been requested (!iowait_boost_pending).
- *
- * A CPU which also appears to have been idle for at least one tick has also
- * its IO boost utilization reset.
- *
- * This mechanism is designed to boost high frequently IO waiting tasks, while
- * being more conservative on tasks which does sporadic IO operations.
- */
-static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long max_cap)
-{
- /* No boost currently required */
- if (!sg_cpu->iowait_boost)
- return 0;
-
- /* Reset boost if the CPU appears to have been idle enough */
- if (sugov_iowait_reset(sg_cpu, time, false))
- return 0;
-
- if (!sg_cpu->iowait_boost_pending) {
- /*
- * No boost pending; reduce the boost value.
- */
- sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN;
- if (!sg_cpu->iowait_boost)
- return 0;
- }
-
- sg_cpu->iowait_boost_pending = false;
-
- if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap)
- sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap;
-
- /*
- * sg_cpu->util is already in capacity scale; convert iowait_boost
- * into the same scale so we can compare.
- */
- return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
-}
-
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
@@ -356,18 +225,12 @@ static inline bool sugov_update_single_c
u64 time, unsigned long max_cap,
unsigned int flags)
{
- unsigned long boost;
-
- sugov_iowait_boost(sg_cpu, time, flags);
- sg_cpu->last_update = time;
-
ignore_dl_rate_limit(sg_cpu);
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
- boost = sugov_iowait_apply(sg_cpu, time, max_cap);
- sugov_get_util(sg_cpu, boost);
+ sugov_get_util(sg_cpu);
return true;
}
@@ -468,11 +331,8 @@ static unsigned int sugov_next_freq_shar
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long boost;
-
- boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
- sugov_get_util(j_sg_cpu, boost);
+ sugov_get_util(j_sg_cpu);
util = max(j_sg_cpu->util, util);
}
@@ -488,9 +348,6 @@ sugov_update_shared(struct update_util_d
raw_spin_lock(&sg_policy->update_lock);
- sugov_iowait_boost(sg_cpu, time, flags);
- sg_cpu->last_update = time;
-
ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time)) {
@@ -560,14 +417,6 @@ static ssize_t rate_limit_us_show(struct
return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
-
-static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf)
-{
- struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
-
- return sprintf(buf, "%u\n", tunables->iowait_boost_cap);
-}
-
static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
@@ -586,30 +435,10 @@ rate_limit_us_store(struct gov_attr_set
return count;
}
-static ssize_t
-iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
-{
- struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- struct sugov_policy *sg_policy;
- unsigned int iowait_boost_cap;
-
- if (kstrtouint(buf, 10, &iowait_boost_cap))
- return -EINVAL;
-
- tunables->iowait_boost_cap = iowait_boost_cap;
-
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
- sg_policy->iowait_boost_cap = iowait_boost_cap;
-
- return count;
-}
-
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
-static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap);
static struct attribute *sugov_attrs[] = {
&rate_limit_us.attr,
- &iowait_boost_cap.attr,
NULL
};
ATTRIBUTE_GROUPS(sugov);
@@ -799,8 +628,6 @@ static int sugov_init(struct cpufreq_pol
tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
- tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE;
-
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
@@ -869,8 +696,6 @@ static int sugov_start(struct cpufreq_po
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
- sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE;
-
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
if (policy_is_shared(policy))

View File

@@ -1,113 +0,0 @@
From af7bbb59c2411e985a5d79173af5686337b4af9b Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:43 +0100
Subject: cpufreq: intel_pstate: Remove iowait boost
Analogous to schedutil, remove iowait boost for the same reasons.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
drivers/cpufreq/intel_pstate.c | 50 ++--------------------------------
1 file changed, 3 insertions(+), 47 deletions(-)
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -191,7 +191,6 @@ struct global_params {
* @policy: CPUFreq policy value
* @update_util: CPUFreq utility callback information
* @update_util_set: CPUFreq utility callback is set
- * @iowait_boost: iowait-related boost fraction
* @last_update: Time of the last update.
* @pstate: Stores P state limits for this CPU
* @vid: Stores VID limits for this CPU
@@ -245,7 +244,6 @@ struct cpudata {
struct acpi_processor_performance acpi_perf_data;
bool valid_pss_table;
#endif
- unsigned int iowait_boost;
s16 epp_powersave;
s16 epp_policy;
s16 epp_default;
@@ -2136,28 +2134,7 @@ static inline void intel_pstate_update_u
{
cpu->sample.time = time;
- if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
- bool do_io = false;
-
- cpu->sched_flags = 0;
- /*
- * Set iowait_boost flag and update time. Since IO WAIT flag
- * is set all the time, we can't just conclude that there is
- * some IO bound activity is scheduled on this CPU with just
- * one occurrence. If we receive at least two in two
- * consecutive ticks, then we treat as boost candidate.
- */
- if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
- do_io = true;
-
- cpu->last_io_update = time;
-
- if (do_io)
- intel_pstate_hwp_boost_up(cpu);
-
- } else {
- intel_pstate_hwp_boost_down(cpu);
- }
+ intel_pstate_hwp_boost_down(cpu);
}
static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
@@ -2240,9 +2217,6 @@ static inline int32_t get_target_pstate(
busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
sample->tsc);
- if (busy_frac < cpu->iowait_boost)
- busy_frac = cpu->iowait_boost;
-
sample->busy_scaled = busy_frac * 100;
target = READ_ONCE(global.no_turbo) ?
@@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(s
sample->aperf,
sample->tsc,
get_avg_frequency(cpu),
- fp_toint(cpu->iowait_boost * 100));
+ 0);
}
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
@@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(str
return;
delta_ns = time - cpu->last_update;
- if (flags & SCHED_CPUFREQ_IOWAIT) {
- /* Start over if the CPU may have been idle. */
- if (delta_ns > TICK_NSEC) {
- cpu->iowait_boost = ONE_EIGHTH_FP;
- } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
- cpu->iowait_boost <<= 1;
- if (cpu->iowait_boost > int_tofp(1))
- cpu->iowait_boost = int_tofp(1);
- } else {
- cpu->iowait_boost = ONE_EIGHTH_FP;
- }
- } else if (cpu->iowait_boost) {
- /* Clear iowait_boost if the CPU may have been idle. */
- if (delta_ns > TICK_NSEC)
- cpu->iowait_boost = 0;
- else
- cpu->iowait_boost >>= 1;
- }
cpu->last_update = time;
delta_ns = time - cpu->sample.time;
if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
@@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct c
sample->aperf,
sample->tsc,
get_avg_frequency(cpu),
- fp_toint(cpu->iowait_boost * 100));
+ 0);
}
static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,

View File

@@ -1,42 +0,0 @@
From fd1e0723b0a7ad140d2bf7cd9154997d5ece2b37 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:44 +0100
Subject: cpufreq: Remove SCHED_CPUFREQ_IOWAIT update
Neither intel_pstate nor schedutil care for the flag anymore, so
remove the update and flag definition.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
include/linux/sched/cpufreq.h | 2 --
kernel/sched/fair.c | 8 --------
2 files changed, 10 deletions(-)
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,8 +8,6 @@
* Interface between cpufreq drivers and the scheduler:
*/
-#define SCHED_CPUFREQ_IOWAIT (1U << 0)
-
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6768,14 +6768,6 @@ enqueue_task_fair(struct rq *rq, struct
*/
util_est_enqueue(&rq->cfs, p);
- /*
- * If in_iowait is set, the code below may not trigger any cpufreq
- * utilization updates, so do it here explicitly with the IOWAIT flag
- * passed.
- */
- if (p->in_iowait)
- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
-
for_each_sched_entity(se) {
if (se->on_rq)
break;

View File

@@ -1,55 +0,0 @@
From 30cdb8d7d06f51bb86142c537ea05bd01c31bb40 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:45 +0100
Subject: io_uring: Do not set iowait before sleeping
Setting in_iowait was introduced in commit
8a796565cec3 ("io_uring: Use io_schedule* in cqring wait")
to tackle a perf regression that was caused by menu taking iowait into
account for synchronous IO and thus not selecting deeper states like in
the io_uring counterpart.
That behaviour is gone, so the workaround can be removed.
Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
io_uring/io_uring.c | 17 -----------------
1 file changed, 17 deletions(-)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2359,15 +2359,6 @@ int io_run_task_work_sig(struct io_ring_
return 0;
}
-static bool current_pending_io(void)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx)
- return false;
- return percpu_counter_read_positive(&tctx->inflight);
-}
-
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
@@ -2385,19 +2376,11 @@ static inline int io_cqring_wait_schedul
if (unlikely(io_should_wake(iowq)))
return 0;
- /*
- * Mark us as being in io_wait if we have pending requests, so cpufreq
- * can take into account that the task is waiting for IO - turns out
- * to be important for low QD IO.
- */
- if (current_pending_io())
- current->in_iowait = 1;
ret = 0;
if (iowq->timeout == KTIME_MAX)
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
ret = -ETIME;
- current->in_iowait = 0;
return ret;
}

View File

@@ -1,4 +1,4 @@
From e6b67a8d14e86d63062e6f1f234c5afc235561d4 Mon Sep 17 00:00:00 2001
From 0a957679a29a06fb2e3971615ff9f05f6becb941 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes

View File

@@ -1,4 +1,4 @@
From 430478d63b1403878f2fd4b12de2cd21ee502184 Mon Sep 17 00:00:00 2001
From 3ed4205afe9305d71d055554ba27e7b8923865dc Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit

View File

@@ -1,4 +1,4 @@
From 8706bf3e3cba8c708f9933f0d1c6a23f9c2c8c33 Mon Sep 17 00:00:00 2001
From 5ffad9b234995f73548763a8487ecd256bba8d8d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling

View File

@@ -11,7 +11,7 @@ avoid having to enable `CONFIG_EXPERT`.
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS
@@ -1089,7 +1089,7 @@ config ARCH_MMAP_RND_BITS
int "Number of bits to use for ASLR of mmap base address" if EXPERT
range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
@@ -20,7 +20,7 @@ avoid having to enable `CONFIG_EXPERT`.
depends on HAVE_ARCH_MMAP_RND_BITS
help
This value can be used to select the number of bits to use to
@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
@@ -1123,7 +1123,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
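
These ARCH_MMAP_RND_* symbols set the defaults for the runtime ASLR knobs exposed under /proc/sys/vm; a small standalone check of their current values is sketched below (the sysctl paths are the standard ones, availability depends on the architecture and kernel configuration):

/* Read the mmap ASLR entropy knobs whose Kconfig defaults are adjusted
 * by the patch above; purely a userspace inspection helper. */
#include <stdio.h>

static void show(const char *path)
{
	FILE *f = fopen(path, "r");
	int bits;

	if (f && fscanf(f, "%d", &bits) == 1)
		printf("%s = %d\n", path, bits);
	else
		printf("%s not available\n", path);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/proc/sys/vm/mmap_rnd_bits");
	show("/proc/sys/vm/mmap_rnd_compat_bits");
	return 0;
}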

View File

@@ -1,112 +0,0 @@
From b7d96c1f19ef15ea431a8d5d7ab2cad22c35edba Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 28 Jul 2024 20:26:59 +0100
Subject: cpufreq: Remove LATENCY_MULTIPLIER

The current LATENCY_MULTIPLIER, which has been around for nearly 20 years,
causes rate_limit_us to always be in the ms range.

On an M1 Mac mini I get 50 and 56us transition latency, but due to the 1000
multiplier we end up setting rate_limit_us to 50 and 56ms, which gets
capped to 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change
default transition delay to 2ms").

On an Intel i5 system the transition latency is 20us, but due to the
multiplier we end up with 20ms, which again is capped to 2ms.

Given how fast modern hardware is, how modern workloads require systems
to be more responsive to sudden changes in workload (tasks
sleeping/waking/migrating, uclamp causing a sudden boost or cap) and
that 2ms is a quarter of the frame time of a 120Hz display, drop the
old logic in favour of providing 50% headroom:

rate_limit_us = 1.5 * latency.

I considered not adding any headroom, which could mean that we end up
with infinite back-to-back requests.

I also considered providing a constant headroom (e.g. 100us), assuming
that any h/w or f/w dealing with the request shouldn't require a large
headroom when transition_latency is actually high.

But for both cases I wasn't sure whether h/w or f/w could end up
overwhelmed dealing with the frequency requests on a potentially busy system.
So I opted for providing 50% breathing room.

This is expected to impact only schedutil, as the other user,
dbs_governor, takes max(2*tick, transition_delay_us), and the former
was at least 2ms with a 1ms TICK, which is equivalent to the max_delay_us
before this patch. For systems with a 4ms TICK, this value
would almost always have ended up as an 8ms sampling rate.

For systems that report 0 transition latency, we still default to
returning 1ms as the transition delay.

This helps eliminate a source of latency in applying frequency requests, as
mentioned in [1]. For example, with a 1ms tick most systems will
miss sending an update at the tick when updating util_avg for a task/CPU
(rate_limit_us will be 2ms for most systems).
Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1]
Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/
Signed-off-by: Qais Yousef <qyousef@layalina.io>
Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
drivers/cpufreq/cpufreq.c | 27 ++++-----------------------
include/linux/cpufreq.h | 6 ------
2 files changed, 4 insertions(+), 29 deletions(-)
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_d
return policy->transition_delay_us;
latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- if (latency) {
- unsigned int max_delay_us = 2 * MSEC_PER_SEC;
+ if (latency)
+ /* Give a 50% breathing room between updates */
+ return latency + (latency >> 1);
- /*
- * If the platform already has high transition_latency, use it
- * as-is.
- */
- if (latency > max_delay_us)
- return latency;
-
- /*
- * For platforms that can change the frequency very fast (< 2
- * us), the above formula gives a decent transition delay. But
- * for platforms where transition_latency is in milliseconds, it
- * ends up giving unrealistic values.
- *
- * Cap the default transition delay to 2 ms, which seems to be
- * a reasonable amount of time after which we should reevaluate
- * the frequency.
- */
- return min(latency * LATENCY_MULTIPLIER, max_delay_us);
- }
-
- return LATENCY_MULTIPLIER;
+ return USEC_PER_MSEC;
}
EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scal
#define CPUFREQ_POLICY_POWERSAVE (1)
#define CPUFREQ_POLICY_PERFORMANCE (2)
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER (1000)
-
struct cpufreq_governor {
char name[CPUFREQ_NAME_LEN];
int (*init)(struct cpufreq_policy *policy);
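
To make the arithmetic in the description above concrete, here is a small standalone sketch of the old and new default transition delay (the 1000 multiplier, 2ms cap, and 1ms fallback mirror the removed logic; the latency samples reuse the figures quoted above and are otherwise assumptions):

/* Illustrative comparison of the removed LATENCY_MULTIPLIER logic and the
 * new 50% headroom rule; not kernel code, values are assumptions. */
#include <stdio.h>

static unsigned int old_delay_us(unsigned int latency_us)
{
	unsigned int max_delay_us = 2 * 1000;		/* 2 ms cap */

	if (!latency_us)
		return 1000;				/* old LATENCY_MULTIPLIER fallback */
	if (latency_us > max_delay_us)
		return latency_us;			/* already high, use as-is */
	return latency_us * 1000 < max_delay_us ?
	       latency_us * 1000 : max_delay_us;	/* min(latency * 1000, 2 ms) */
}

static unsigned int new_delay_us(unsigned int latency_us)
{
	if (!latency_us)
		return 1000;				/* still default to 1 ms */
	return latency_us + (latency_us >> 1);		/* 50% breathing room */
}

int main(void)
{
	unsigned int samples[] = { 20, 50, 56 };	/* i5 and M1 figures from above */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("latency %3uus -> old %4uus, new %3uus\n",
		       samples[i], old_delay_us(samples[i]), new_delay_us(samples[i]));
	return 0;
}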

View File

@@ -1,26 +0,0 @@
From b97d21a0aa65a6f7a7bb17bbc696b136688c96ed Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 26 Aug 2024 08:50:11 -0400
Subject: nfsd: add more info to WARN_ON_ONCE on failed callbacks
Currently, you get the warning and stack trace, but nothing is printed
about the relevant error codes. Add that in.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/nfsd/nfs4callback.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1333,7 +1333,8 @@ static void nfsd4_cb_done(struct rpc_tas
return;
if (cb->cb_status) {
- WARN_ON_ONCE(task->tk_status);
+ WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d",
+ cb->cb_status, task->tk_status);
task->tk_status = cb->cb_status;
}

Some files were not shown because too many files have changed in this diff.