add 3rd party/custom patches

3rd party patches (in alphabetical order):
- bbr3
- ntsync5
- openwrt
- pf-kernel
- xanmod
- zen

No configuration changes for now.
@@ -0,0 +1,70 @@
From 3427331872c37b2edb42406c65764e1565b0591b Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Fri, 9 Aug 2024 14:09:05 +0800
Subject: cpufreq: amd-pstate: add quirk for Ryzen 3000 series processor

The Ryzen 3000 series processors have been observed lacking the
nominal_freq and lowest_freq parameters in their ACPI tables. This
absence causes issues with loading the amd-pstate driver on these
systems. Introduces a fix to resolve the dependency issue
by adding a quirk specifically for the Ryzen 3000 series.

Reported-by: David Wang <00107082@163.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -142,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62
.lowest_freq = 550,
};

+static struct quirk_entry quirk_amd_mts = {
+ .nominal_freq = 3600,
+ .lowest_freq = 550,
+};
+
static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
{
/**
@@ -158,6 +163,21 @@ static int __init dmi_matched_7k62_bios_
return 0;
}

+static int __init dmi_matched_mts_bios_bug(const struct dmi_system_id *dmi)
+{
+ /**
+ * match the broken bios for ryzen 3000 series processor support CPPC V2
+ * broken BIOS lack of nominal_freq and lowest_freq capabilities
+ * definition in ACPI tables
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2)) {
+ quirks = dmi->driver_data;
+ pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident);
+ return 1;
+ }
+
+ return 0;
+}
static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
{
.callback = dmi_matched_7k62_bios_bug,
@@ -168,6 +188,16 @@ static const struct dmi_system_id amd_ps
},
.driver_data = &quirk_amd_7k62,
},
+ {
+ .callback = dmi_matched_mts_bios_bug,
+ .ident = "AMD Ryzen 3000",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "B450M MORTAR MAX (MS-7B89)"),
+ DMI_MATCH(DMI_BIOS_RELEASE, "06/10/2020"),
+ DMI_MATCH(DMI_BIOS_VERSION, "5.14"),
+ },
+ .driver_data = &quirk_amd_mts,
+ },
{}
};
MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table);
@@ -0,0 +1,88 @@
From 44f21855901b1fd618ac16b07dbd14e8fea4ee13 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:11 -0500
Subject: cpufreq/amd-pstate: Export symbols for changing modes

In order to effectively test all mode switch combinations export
everything necessarily for amd-pstate-ut to trigger a mode switch.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 23 ++++++++++-------------
 drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 13 deletions(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -60,18 +60,6 @@
#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF
#define AMD_CPPC_EPP_POWERSAVE 0xFF

-/*
- * enum amd_pstate_mode - driver working mode of amd pstate
- */
-enum amd_pstate_mode {
- AMD_PSTATE_UNDEFINED = 0,
- AMD_PSTATE_DISABLE,
- AMD_PSTATE_PASSIVE,
- AMD_PSTATE_ACTIVE,
- AMD_PSTATE_GUIDED,
- AMD_PSTATE_MAX,
-};
-
static const char * const amd_pstate_mode_string[] = {
[AMD_PSTATE_UNDEFINED] = "undefined",
[AMD_PSTATE_DISABLE] = "disable",
@@ -81,6 +69,14 @@ static const char * const amd_pstate_mod
NULL,
};

+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode)
+{
+ if (mode < 0 || mode >= AMD_PSTATE_MAX)
+ return NULL;
+ return amd_pstate_mode_string[mode];
+}
+EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string);
+
struct quirk_entry {
u32 nominal_freq;
u32 lowest_freq;
@@ -1392,7 +1388,7 @@ static ssize_t amd_pstate_show_status(ch
return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
}

-static int amd_pstate_update_status(const char *buf, size_t size)
+int amd_pstate_update_status(const char *buf, size_t size)
{
int mode_idx;

@@ -1409,6 +1405,7 @@ static int amd_pstate_update_status(cons

return 0;
}
+EXPORT_SYMBOL_GPL(amd_pstate_update_status);

static ssize_t status_show(struct device *dev,
struct device_attribute *attr, char *buf)
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -103,4 +103,18 @@ struct amd_cpudata {
bool boost_state;
};

+/*
+ * enum amd_pstate_mode - driver working mode of amd pstate
+ */
+enum amd_pstate_mode {
+ AMD_PSTATE_UNDEFINED = 0,
+ AMD_PSTATE_DISABLE,
+ AMD_PSTATE_PASSIVE,
+ AMD_PSTATE_ACTIVE,
+ AMD_PSTATE_GUIDED,
+ AMD_PSTATE_MAX,
+};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode);
+int amd_pstate_update_status(const char *buf, size_t size);
+
#endif /* _LINUX_AMD_PSTATE_H */
@@ -0,0 +1,77 @@
|
||||
From aabfc7370a7da9c52be97c79ba70a20201e6864a Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Sat, 31 Aug 2024 21:49:12 -0500
|
||||
Subject: cpufreq/amd-pstate-ut: Add test case for mode switches
|
||||
|
||||
There is a state machine in the amd-pstate driver utilized for
|
||||
switches for all modes. To make sure that cleanup and setup works
|
||||
properly for each mode add a unit test case that tries all
|
||||
combinations.
|
||||
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 40 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid
|
||||
static void amd_pstate_ut_check_enabled(u32 index);
|
||||
static void amd_pstate_ut_check_perf(u32 index);
|
||||
static void amd_pstate_ut_check_freq(u32 index);
|
||||
+static void amd_pstate_ut_check_driver(u32 index);
|
||||
|
||||
static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
|
||||
{"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid },
|
||||
{"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled },
|
||||
{"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf },
|
||||
- {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }
|
||||
+ {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq },
|
||||
+ {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }
|
||||
};
|
||||
|
||||
static bool get_shared_mem(void)
|
||||
@@ -257,6 +259,43 @@ skip_test:
|
||||
cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
+static int amd_pstate_set_mode(enum amd_pstate_mode mode)
|
||||
+{
|
||||
+ const char *mode_str = amd_pstate_get_mode_string(mode);
|
||||
+
|
||||
+ pr_debug("->setting mode to %s\n", mode_str);
|
||||
+
|
||||
+ return amd_pstate_update_status(mode_str, strlen(mode_str));
|
||||
+}
|
||||
+
|
||||
+static void amd_pstate_ut_check_driver(u32 index)
|
||||
+{
|
||||
+ enum amd_pstate_mode mode1, mode2;
|
||||
+ int ret;
|
||||
+
|
||||
+ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
|
||||
+ ret = amd_pstate_set_mode(mode1);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
+ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
|
||||
+ if (mode1 == mode2)
|
||||
+ continue;
|
||||
+ ret = amd_pstate_set_mode(mode2);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
+ }
|
||||
+ }
|
||||
+out:
|
||||
+ if (ret)
|
||||
+ pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
|
||||
+ amd_pstate_get_mode_string(mode1),
|
||||
+ amd_pstate_get_mode_string(mode2), ret);
|
||||
+
|
||||
+ amd_pstate_ut_cases[index].result = ret ?
|
||||
+ AMD_PSTATE_UT_RESULT_FAIL :
|
||||
+ AMD_PSTATE_UT_RESULT_PASS;
|
||||
+}
|
||||
+
|
||||
static int __init amd_pstate_ut_init(void)
|
||||
{
|
||||
u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);
|
@@ -0,0 +1,60 @@
|
||||
From 24e62fbc101d079d398ac6fc76f458676d3d9491 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Sun, 1 Sep 2024 00:00:35 -0500
|
||||
Subject: cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit()
|
||||
|
||||
amd_pstate_set_epp() calls cppc_set_epp_perf() which can fail for
|
||||
a variety of reasons but this is ignored. Change the return flow
|
||||
to allow failures.
|
||||
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 11 +++++++----
|
||||
1 file changed, 7 insertions(+), 4 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1595,7 +1595,7 @@ static void amd_pstate_epp_cpu_exit(stru
|
||||
pr_debug("CPU %d exiting\n", policy->cpu);
|
||||
}
|
||||
|
||||
-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
|
||||
+static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
|
||||
@@ -1645,7 +1645,7 @@ static void amd_pstate_epp_update_limit(
|
||||
* This return value can only be negative for shared_memory
|
||||
* systems where EPP register read/write not supported.
|
||||
*/
|
||||
- return;
|
||||
+ return epp;
|
||||
}
|
||||
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
@@ -1658,12 +1658,13 @@ static void amd_pstate_epp_update_limit(
|
||||
}
|
||||
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
- amd_pstate_set_epp(cpudata, epp);
|
||||
+ return amd_pstate_set_epp(cpudata, epp);
|
||||
}
|
||||
|
||||
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+ int ret;
|
||||
|
||||
if (!policy->cpuinfo.max_freq)
|
||||
return -ENODEV;
|
||||
@@ -1673,7 +1674,9 @@ static int amd_pstate_epp_set_policy(str
|
||||
|
||||
cpudata->policy = policy->policy;
|
||||
|
||||
- amd_pstate_epp_update_limit(policy);
|
||||
+ ret = amd_pstate_epp_update_limit(policy);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
|
||||
/*
|
||||
* policy->cur is never updated with the amd_pstate_epp driver, but it
|
@@ -0,0 +1,67 @@
|
||||
From 29c0347dd542e091e2f7e5980dd885f918f5f676 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:29:57 -0500
|
||||
Subject: x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c
|
||||
|
||||
To prepare to let amd_get_highest_perf() detect preferred cores
|
||||
it will require CPPC functions. Move amd_get_highest_perf() to
|
||||
cppc.c to prepare for 'preferred core detection' rework.
|
||||
|
||||
No functional changes intended.
|
||||
|
||||
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++
|
||||
arch/x86/kernel/cpu/amd.c | 16 ----------------
|
||||
2 files changed, 16 insertions(+), 16 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void)
|
||||
init_done = true;
|
||||
mutex_unlock(&freq_invariance_lock);
|
||||
}
|
||||
+
|
||||
+u32 amd_get_highest_perf(void)
|
||||
+{
|
||||
+ struct cpuinfo_x86 *c = &boot_cpu_data;
|
||||
+
|
||||
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
|
||||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
|
||||
+ return 166;
|
||||
+
|
||||
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
|
||||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
|
||||
+ return 166;
|
||||
+
|
||||
+ return 255;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
--- a/arch/x86/kernel/cpu/amd.c
|
||||
+++ b/arch/x86/kernel/cpu/amd.c
|
||||
@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsig
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
|
||||
|
||||
-u32 amd_get_highest_perf(void)
|
||||
-{
|
||||
- struct cpuinfo_x86 *c = &boot_cpu_data;
|
||||
-
|
||||
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
|
||||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
|
||||
- return 166;
|
||||
-
|
||||
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
|
||||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
|
||||
- return 166;
|
||||
-
|
||||
- return 255;
|
||||
-}
|
||||
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
-
|
||||
static void zenbleed_check_cpu(void *unused)
|
||||
{
|
||||
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
|
@@ -0,0 +1,95 @@
|
||||
From 072efeb45349edd8ba9def11b6a450eaf56690a8 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:29:58 -0500
|
||||
Subject: ACPI: CPPC: Adjust return code for inline functions in
|
||||
!CONFIG_ACPI_CPPC_LIB
|
||||
|
||||
Checkpath emits the following warning:
|
||||
```
|
||||
WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP
|
||||
```
|
||||
|
||||
Adjust the code accordingly.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
include/acpi/cppc_acpi.h | 26 +++++++++++++-------------
|
||||
1 file changed, 13 insertions(+), 13 deletions(-)
|
||||
|
||||
--- a/include/acpi/cppc_acpi.h
|
||||
+++ b/include/acpi/cppc_acpi.h
|
||||
@@ -164,31 +164,31 @@ extern int cppc_set_auto_sel(int cpu, bo
|
||||
#else /* !CONFIG_ACPI_CPPC_LIB */
|
||||
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_set_enable(int cpu, bool enable)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline bool cppc_perf_ctrs_in_pcc(void)
|
||||
{
|
||||
@@ -212,27 +212,27 @@ static inline bool cpc_ffh_supported(voi
|
||||
}
|
||||
static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_set_auto_sel(int cpu, bool enable)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
|
||||
{
|
||||
- return -ENOTSUPP;
|
||||
+ return -EOPNOTSUPP;
|
||||
}
|
||||
#endif /* !CONFIG_ACPI_CPPC_LIB */
|
||||
|
@@ -0,0 +1,162 @@
|
||||
From 21492d91ffc7c3fdb6507f64a74abf8326c75141 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:29:59 -0500
|
||||
Subject: x86/amd: Rename amd_get_highest_perf() to
|
||||
amd_get_boost_ratio_numerator()
|
||||
|
||||
The function name is ambiguous because it returns an intermediate value
|
||||
for calculating maximum frequency rather than the CPPC 'Highest Perf'
|
||||
register.
|
||||
|
||||
Rename the function to clarify its use and allow the function to return
|
||||
errors. Adjust the consumer in acpi-cpufreq to catch errors.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
arch/x86/include/asm/processor.h | 3 ---
|
||||
arch/x86/kernel/acpi/cppc.c | 44 +++++++++++++++++++++++---------
|
||||
drivers/cpufreq/acpi-cpufreq.c | 12 ++++++---
|
||||
include/acpi/cppc_acpi.h | 5 ++++
|
||||
4 files changed, 46 insertions(+), 18 deletions(-)
|
||||
|
||||
--- a/arch/x86/include/asm/processor.h
|
||||
+++ b/arch/x86/include/asm/processor.h
|
||||
@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigne
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CPU_SUP_AMD
|
||||
-extern u32 amd_get_highest_perf(void);
|
||||
-
|
||||
/*
|
||||
* Issue a DIV 0/1 insn to clear any division data from previous DIV
|
||||
* operations.
|
||||
@@ -705,7 +703,6 @@ static __always_inline void amd_clear_di
|
||||
|
||||
extern void amd_check_microcode(void);
|
||||
#else
|
||||
-static inline u32 amd_get_highest_perf(void) { return 0; }
|
||||
static inline void amd_clear_divider(void) { }
|
||||
static inline void amd_check_microcode(void) { }
|
||||
#endif
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc
|
||||
static void amd_set_max_freq_ratio(void)
|
||||
{
|
||||
struct cppc_perf_caps perf_caps;
|
||||
- u64 highest_perf, nominal_perf;
|
||||
+ u64 numerator, nominal_perf;
|
||||
u64 perf_ratio;
|
||||
int rc;
|
||||
|
||||
@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void)
|
||||
return;
|
||||
}
|
||||
|
||||
- highest_perf = amd_get_highest_perf();
|
||||
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
|
||||
+ if (rc) {
|
||||
+ pr_debug("Could not retrieve highest performance (%d)\n", rc);
|
||||
+ return;
|
||||
+ }
|
||||
nominal_perf = perf_caps.nominal_perf;
|
||||
|
||||
- if (!highest_perf || !nominal_perf) {
|
||||
- pr_debug("Could not retrieve highest or nominal performance\n");
|
||||
+ if (!nominal_perf) {
|
||||
+ pr_debug("Could not retrieve nominal performance\n");
|
||||
return;
|
||||
}
|
||||
|
||||
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
|
||||
+ perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
|
||||
/* midpoint between max_boost and max_P */
|
||||
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
|
||||
if (!perf_ratio) {
|
||||
@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void)
|
||||
mutex_unlock(&freq_invariance_lock);
|
||||
}
|
||||
|
||||
-u32 amd_get_highest_perf(void)
|
||||
+/**
|
||||
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
|
||||
+ * @cpu: CPU to get numerator for.
|
||||
+ * @numerator: Output variable for numerator.
|
||||
+ *
|
||||
+ * Determine the numerator to use for calculating the boost ratio on
|
||||
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
|
||||
+ * value. On other systems this will the highest performance register value.
|
||||
+ *
|
||||
+ * Return: 0 for success, negative error code otherwise.
|
||||
+ */
|
||||
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
|
||||
{
|
||||
struct cpuinfo_x86 *c = &boot_cpu_data;
|
||||
|
||||
if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
|
||||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
|
||||
- return 166;
|
||||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
|
||||
+ *numerator = 166;
|
||||
+ return 0;
|
||||
+ }
|
||||
|
||||
if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
|
||||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
|
||||
- return 166;
|
||||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
|
||||
+ *numerator = 166;
|
||||
+ return 0;
|
||||
+ }
|
||||
+ *numerator = 255;
|
||||
|
||||
- return 255;
|
||||
+ return 0;
|
||||
}
|
||||
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
|
||||
--- a/drivers/cpufreq/acpi-cpufreq.c
|
||||
+++ b/drivers/cpufreq/acpi-cpufreq.c
|
||||
@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned
|
||||
return 0;
|
||||
}
|
||||
|
||||
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
|
||||
- highest_perf = amd_get_highest_perf();
|
||||
- else
|
||||
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
|
||||
+ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf);
|
||||
+ if (ret) {
|
||||
+ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n",
|
||||
+ cpu, ret);
|
||||
+ return 0;
|
||||
+ }
|
||||
+ } else {
|
||||
highest_perf = perf_caps.highest_perf;
|
||||
+ }
|
||||
|
||||
nominal_perf = perf_caps.nominal_perf;
|
||||
|
||||
--- a/include/acpi/cppc_acpi.h
|
||||
+++ b/include/acpi/cppc_acpi.h
|
||||
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
|
||||
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
|
||||
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
|
||||
extern int cppc_set_auto_sel(int cpu, bool enable);
|
||||
+extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
|
||||
#else /* !CONFIG_ACPI_CPPC_LIB */
|
||||
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
|
||||
{
|
||||
@@ -234,6 +235,10 @@ static inline int cppc_get_auto_sel_caps
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
|
||||
+{
|
||||
+ return -EOPNOTSUPP;
|
||||
+}
|
||||
#endif /* !CONFIG_ACPI_CPPC_LIB */
|
||||
|
||||
#endif /* _CPPC_ACPI_H*/
|
debian/patches/patchset-pf/amd-pstate/0008-ACPI-CPPC-Drop-check-for-non-zero-perf-ratio.patch (new file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
From 6f10d066dce0f1781b514a0352f0b427a32b1bb2 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:00 -0500
|
||||
Subject: ACPI: CPPC: Drop check for non zero perf ratio
|
||||
|
||||
perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number.
|
||||
Shifting by one will never have a zero value.
|
||||
|
||||
Drop the check.
|
||||
|
||||
Suggested-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
---
|
||||
arch/x86/kernel/acpi/cppc.c | 7 +------
|
||||
1 file changed, 1 insertion(+), 6 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void)
|
||||
return;
|
||||
}
|
||||
|
||||
- perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
|
||||
/* midpoint between max_boost and max_P */
|
||||
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
|
||||
- if (!perf_ratio) {
|
||||
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
|
||||
- return;
|
||||
- }
|
||||
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
|
||||
|
||||
freq_invariance_set_perf_ratio(perf_ratio, false);
|
||||
}
|
@@ -0,0 +1,44 @@
|
||||
From 8c142a91a58f24119e99d4e66b11890f4a4ef984 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:01 -0500
|
||||
Subject: ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn
|
||||
|
||||
If the boost ratio isn't calculated properly for the system for any
|
||||
reason this can cause other problems that are non-obvious.
|
||||
|
||||
Raise all messages to warn instead.
|
||||
|
||||
Suggested-by: Perry Yuan <Perry.Yuan@amd.com>
|
||||
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
arch/x86/kernel/acpi/cppc.c | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void)
|
||||
|
||||
rc = cppc_get_perf_caps(0, &perf_caps);
|
||||
if (rc) {
|
||||
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
|
||||
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
|
||||
return;
|
||||
}
|
||||
|
||||
rc = amd_get_boost_ratio_numerator(0, &numerator);
|
||||
if (rc) {
|
||||
- pr_debug("Could not retrieve highest performance (%d)\n", rc);
|
||||
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
|
||||
return;
|
||||
}
|
||||
nominal_perf = perf_caps.nominal_perf;
|
||||
|
||||
if (!nominal_perf) {
|
||||
- pr_debug("Could not retrieve nominal performance\n");
|
||||
+ pr_warn("Could not retrieve nominal performance\n");
|
||||
return;
|
||||
}
|
||||
|
debian/patches/patchset-pf/amd-pstate/0010-x86-amd-Move-amd_get_highest_perf-out-of-amd-pstate.patch (new file, 138 lines)
@@ -0,0 +1,138 @@
|
||||
From 952e7bdc4cf67603f230f8eb91818ad4676e5a83 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:02 -0500
|
||||
Subject: x86/amd: Move amd_get_highest_perf() out of amd-pstate
|
||||
|
||||
amd_pstate_get_highest_perf() is a helper used to get the highest perf
|
||||
value on AMD systems. It's used in amd-pstate as part of preferred
|
||||
core handling, but applicable for acpi-cpufreq as well.
|
||||
|
||||
Move it out to cppc handling code as amd_get_highest_perf().
|
||||
|
||||
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
arch/x86/kernel/acpi/cppc.c | 30 ++++++++++++++++++++++++++++++
|
||||
drivers/cpufreq/amd-pstate.c | 34 ++--------------------------------
|
||||
include/acpi/cppc_acpi.h | 5 +++++
|
||||
3 files changed, 37 insertions(+), 32 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void)
|
||||
mutex_unlock(&freq_invariance_lock);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Get the highest performance register value.
|
||||
+ * @cpu: CPU from which to get highest performance.
|
||||
+ * @highest_perf: Return address for highest performance value.
|
||||
+ *
|
||||
+ * Return: 0 for success, negative error code otherwise.
|
||||
+ */
|
||||
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
|
||||
+{
|
||||
+ u64 val;
|
||||
+ int ret;
|
||||
+
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
+
|
||||
+ val = AMD_CPPC_HIGHEST_PERF(val);
|
||||
+ } else {
|
||||
+ ret = cppc_get_highest_perf(cpu, &val);
|
||||
+ if (ret)
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ WRITE_ONCE(*highest_perf, (u32)val);
|
||||
+out:
|
||||
+ return ret;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
+
|
||||
/**
|
||||
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
|
||||
* @cpu: CPU to get numerator for.
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -837,36 +837,6 @@ static void amd_pstste_sched_prefcore_wo
|
||||
}
|
||||
static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
|
||||
|
||||
-/*
|
||||
- * Get the highest performance register value.
|
||||
- * @cpu: CPU from which to get highest performance.
|
||||
- * @highest_perf: Return address.
|
||||
- *
|
||||
- * Return: 0 for success, -EIO otherwise.
|
||||
- */
|
||||
-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
|
||||
-{
|
||||
- int ret;
|
||||
-
|
||||
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
- u64 cap1;
|
||||
-
|
||||
- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
|
||||
- } else {
|
||||
- u64 cppc_highest_perf;
|
||||
-
|
||||
- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- WRITE_ONCE(*highest_perf, cppc_highest_perf);
|
||||
- }
|
||||
-
|
||||
- return (ret);
|
||||
-}
|
||||
-
|
||||
#define CPPC_MAX_PERF U8_MAX
|
||||
|
||||
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
|
||||
@@ -874,7 +844,7 @@ static void amd_pstate_init_prefcore(str
|
||||
int ret, prio;
|
||||
u32 highest_perf;
|
||||
|
||||
- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
|
||||
+ ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
@@ -918,7 +888,7 @@ static void amd_pstate_update_limits(uns
|
||||
if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
|
||||
goto free_cpufreq_put;
|
||||
|
||||
- ret = amd_pstate_get_highest_perf(cpu, &cur_high);
|
||||
+ ret = amd_get_highest_perf(cpu, &cur_high);
|
||||
if (ret)
|
||||
goto free_cpufreq_put;
|
||||
|
||||
--- a/include/acpi/cppc_acpi.h
|
||||
+++ b/include/acpi/cppc_acpi.h
|
||||
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
|
||||
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
|
||||
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
|
||||
extern int cppc_set_auto_sel(int cpu, bool enable);
|
||||
+extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
|
||||
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
|
||||
#else /* !CONFIG_ACPI_CPPC_LIB */
|
||||
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
|
||||
@@ -235,6 +236,10 @@ static inline int cppc_get_auto_sel_caps
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
|
||||
+{
|
||||
+ return -ENODEV;
|
||||
+}
|
||||
static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
@@ -0,0 +1,251 @@
|
||||
From 3ab7da5bbf2087982dbfe2b0f2937d0dddc3afb1 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:03 -0500
|
||||
Subject: x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()
|
||||
|
||||
AMD systems that support preferred cores will use "166" as their
|
||||
numerator for max frequency calculations instead of "255".
|
||||
|
||||
Add a function for detecting preferred cores by looking at the
|
||||
highest perf value on all cores.
|
||||
|
||||
If preferred cores are enabled return 166 and if disabled the
|
||||
value in the highest perf register. As the function will be called
|
||||
multiple times, cache the values for the boost numerator and if
|
||||
preferred cores will be enabled in global variables.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
---
|
||||
arch/x86/kernel/acpi/cppc.c | 93 ++++++++++++++++++++++++++++++++----
|
||||
drivers/cpufreq/amd-pstate.c | 34 +++++--------
|
||||
include/acpi/cppc_acpi.h | 5 ++
|
||||
3 files changed, 101 insertions(+), 31 deletions(-)
|
||||
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -9,6 +9,16 @@
|
||||
#include <asm/processor.h>
|
||||
#include <asm/topology.h>
|
||||
|
||||
+#define CPPC_HIGHEST_PERF_PREFCORE 166
|
||||
+
|
||||
+enum amd_pref_core {
|
||||
+ AMD_PREF_CORE_UNKNOWN = 0,
|
||||
+ AMD_PREF_CORE_SUPPORTED,
|
||||
+ AMD_PREF_CORE_UNSUPPORTED,
|
||||
+};
|
||||
+static enum amd_pref_core amd_pref_core_detected;
|
||||
+static u64 boost_numerator;
|
||||
+
|
||||
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
|
||||
|
||||
bool cpc_supported_by_cpu(void)
|
||||
@@ -147,6 +157,66 @@ out:
|
||||
EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
|
||||
/**
|
||||
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
|
||||
+ * @detected: Output variable for the result of the detection.
|
||||
+ *
|
||||
+ * Determine whether CPUs in the system support preferred cores. On systems
|
||||
+ * that support preferred cores, different highest perf values will be found
|
||||
+ * on different cores. On other systems, the highest perf value will be the
|
||||
+ * same on all cores.
|
||||
+ *
|
||||
+ * The result of the detection will be stored in the 'detected' parameter.
|
||||
+ *
|
||||
+ * Return: 0 for success, negative error code otherwise
|
||||
+ */
|
||||
+int amd_detect_prefcore(bool *detected)
|
||||
+{
|
||||
+ int cpu, count = 0;
|
||||
+ u64 highest_perf[2] = {0};
|
||||
+
|
||||
+ if (WARN_ON(!detected))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ switch (amd_pref_core_detected) {
|
||||
+ case AMD_PREF_CORE_SUPPORTED:
|
||||
+ *detected = true;
|
||||
+ return 0;
|
||||
+ case AMD_PREF_CORE_UNSUPPORTED:
|
||||
+ *detected = false;
|
||||
+ return 0;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ for_each_present_cpu(cpu) {
|
||||
+ u32 tmp;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = amd_get_highest_perf(cpu, &tmp);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ if (!count || (count == 1 && tmp != highest_perf[0]))
|
||||
+ highest_perf[count++] = tmp;
|
||||
+
|
||||
+ if (count == 2)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ *detected = (count == 2);
|
||||
+ boost_numerator = highest_perf[0];
|
||||
+
|
||||
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
|
||||
+ AMD_PREF_CORE_UNSUPPORTED;
|
||||
+
|
||||
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
|
||||
+ *detected ? "" : "un", highest_perf[0]);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
|
||||
+
|
||||
+/**
|
||||
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
|
||||
* @cpu: CPU to get numerator for.
|
||||
* @numerator: Output variable for numerator.
|
||||
@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf);
|
||||
* a CPU. On systems that support preferred cores, this will be a hardcoded
|
||||
* value. On other systems this will the highest performance register value.
|
||||
*
|
||||
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
|
||||
+ * the correct boost numerator will be returned to match hardware capabilities
|
||||
+ * even if the preferred cores scheduling hints are not enabled.
|
||||
+ *
|
||||
* Return: 0 for success, negative error code otherwise.
|
||||
*/
|
||||
int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
|
||||
{
|
||||
- struct cpuinfo_x86 *c = &boot_cpu_data;
|
||||
-
|
||||
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
|
||||
- (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
|
||||
- *numerator = 166;
|
||||
- return 0;
|
||||
- }
|
||||
+ bool prefcore;
|
||||
+ int ret;
|
||||
|
||||
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
|
||||
- (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
|
||||
- *numerator = 166;
|
||||
+ ret = amd_detect_prefcore(&prefcore);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ /* without preferred cores, return the highest perf register value */
|
||||
+ if (!prefcore) {
|
||||
+ *numerator = boost_numerator;
|
||||
return 0;
|
||||
}
|
||||
- *numerator = 255;
|
||||
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -841,32 +841,18 @@ static DECLARE_WORK(sched_prefcore_work,
|
||||
|
||||
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
|
||||
{
|
||||
- int ret, prio;
|
||||
- u32 highest_perf;
|
||||
-
|
||||
- ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
|
||||
- if (ret)
|
||||
+ /* user disabled or not detected */
|
||||
+ if (!amd_pstate_prefcore)
|
||||
return;
|
||||
|
||||
cpudata->hw_prefcore = true;
|
||||
- /* check if CPPC preferred core feature is enabled*/
|
||||
- if (highest_perf < CPPC_MAX_PERF)
|
||||
- prio = (int)highest_perf;
|
||||
- else {
|
||||
- pr_debug("AMD CPPC preferred core is unsupported!\n");
|
||||
- cpudata->hw_prefcore = false;
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
- if (!amd_pstate_prefcore)
|
||||
- return;
|
||||
|
||||
/*
|
||||
* The priorities can be set regardless of whether or not
|
||||
* sched_set_itmt_support(true) has been called and it is valid to
|
||||
* update them at any time after it has been called.
|
||||
*/
|
||||
- sched_set_itmt_core_prio(prio, cpudata->cpu);
|
||||
+ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu);
|
||||
|
||||
schedule_work(&sched_prefcore_work);
|
||||
}
|
||||
@@ -1037,12 +1023,12 @@ static int amd_pstate_cpu_init(struct cp
|
||||
|
||||
cpudata->cpu = policy->cpu;
|
||||
|
||||
- amd_pstate_init_prefcore(cpudata);
|
||||
-
|
||||
ret = amd_pstate_init_perf(cpudata);
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
|
||||
+ amd_pstate_init_prefcore(cpudata);
|
||||
+
|
||||
ret = amd_pstate_init_freq(cpudata);
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
@@ -1493,12 +1479,12 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
cpudata->cpu = policy->cpu;
|
||||
cpudata->epp_policy = 0;
|
||||
|
||||
- amd_pstate_init_prefcore(cpudata);
|
||||
-
|
||||
ret = amd_pstate_init_perf(cpudata);
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
|
||||
+ amd_pstate_init_prefcore(cpudata);
|
||||
+
|
||||
ret = amd_pstate_init_freq(cpudata);
|
||||
if (ret)
|
||||
goto free_cpudata1;
|
||||
@@ -1960,6 +1946,12 @@ static int __init amd_pstate_init(void)
|
||||
static_call_update(amd_pstate_update_perf, cppc_update_perf);
|
||||
}
|
||||
|
||||
+ if (amd_pstate_prefcore) {
|
||||
+ ret = amd_detect_prefcore(&amd_pstate_prefcore);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
/* enable amd pstate feature */
|
||||
ret = amd_pstate_enable(true);
|
||||
if (ret) {
|
||||
--- a/include/acpi/cppc_acpi.h
|
||||
+++ b/include/acpi/cppc_acpi.h
|
||||
@@ -163,6 +163,7 @@ extern int cppc_get_auto_sel_caps(int cp
|
||||
extern int cppc_set_auto_sel(int cpu, bool enable);
|
||||
extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
|
||||
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
|
||||
+extern int amd_detect_prefcore(bool *detected);
|
||||
#else /* !CONFIG_ACPI_CPPC_LIB */
|
||||
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
|
||||
{
|
||||
@@ -244,6 +245,10 @@ static inline int amd_get_boost_ratio_nu
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
+static inline int amd_detect_prefcore(bool *detected)
|
||||
+{
|
||||
+ return -ENODEV;
|
||||
+}
|
||||
#endif /* !CONFIG_ACPI_CPPC_LIB */
|
||||
|
||||
#endif /* _CPPC_ACPI_H*/
|
@@ -0,0 +1,169 @@
|
||||
From 68d89574b86625f4bd7a784fe9bcc221dc290e4f Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:04 -0500
|
||||
Subject: cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into
|
||||
amd_get_boost_ratio_numerator()
|
||||
|
||||
The special case in amd_pstate_highest_perf_set() is the value used
|
||||
for calculating the boost numerator. Merge this into
|
||||
amd_get_boost_ratio_numerator() and then use that to calculate boost
|
||||
ratio.
|
||||
|
||||
This allows dropping more special casing of the highest perf value.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
Documentation/admin-guide/pm/amd-pstate.rst | 3 +-
|
||||
arch/x86/kernel/acpi/cppc.c | 16 +++++++
|
||||
drivers/cpufreq/amd-pstate.c | 52 ++++-----------------
|
||||
3 files changed, 28 insertions(+), 43 deletions(-)
|
||||
|
||||
--- a/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Perfo
|
||||
In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
|
||||
table, so we need to expose it to sysfs. If boost is not active, but
|
||||
still supported, this maximum frequency will be larger than the one in
|
||||
-``cpuinfo``.
|
||||
+``cpuinfo``. On systems that support preferred core, the driver will have
|
||||
+different values for some cores than others.
|
||||
This attribute is read-only.
|
||||
|
||||
``amd_pstate_lowest_nonlinear_freq``
|
||||
--- a/arch/x86/kernel/acpi/cppc.c
|
||||
+++ b/arch/x86/kernel/acpi/cppc.c
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <asm/processor.h>
|
||||
#include <asm/topology.h>
|
||||
|
||||
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
|
||||
#define CPPC_HIGHEST_PERF_PREFCORE 166
|
||||
|
||||
enum amd_pref_core {
|
||||
@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsign
|
||||
*numerator = boost_numerator;
|
||||
return 0;
|
||||
}
|
||||
+
|
||||
+ /*
|
||||
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
|
||||
+ * the highest performance level is set to 196.
|
||||
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
|
||||
+ */
|
||||
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
|
||||
+ switch (boot_cpu_data.x86_model) {
|
||||
+ case 0x70 ... 0x7f:
|
||||
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
|
||||
+ return 0;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
*numerator = CPPC_HIGHEST_PERF_PREFCORE;
|
||||
|
||||
return 0;
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -52,8 +52,6 @@
|
||||
#define AMD_PSTATE_TRANSITION_LATENCY 20000
|
||||
#define AMD_PSTATE_TRANSITION_DELAY 1000
|
||||
#define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600
|
||||
-#define CPPC_HIGHEST_PERF_PERFORMANCE 196
|
||||
-#define CPPC_HIGHEST_PERF_DEFAULT 166
|
||||
|
||||
#define AMD_CPPC_EPP_PERFORMANCE 0x00
|
||||
#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80
|
||||
@@ -398,43 +396,17 @@ static inline int amd_pstate_enable(bool
|
||||
return static_call(amd_pstate_enable)(enable);
|
||||
}
|
||||
|
||||
-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata)
|
||||
-{
|
||||
- struct cpuinfo_x86 *c = &cpu_data(0);
|
||||
-
|
||||
- /*
|
||||
- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
|
||||
- * the highest performance level is set to 196.
|
||||
- * https://bugzilla.kernel.org/show_bug.cgi?id=218759
|
||||
- */
|
||||
- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f))
|
||||
- return CPPC_HIGHEST_PERF_PERFORMANCE;
|
||||
-
|
||||
- return CPPC_HIGHEST_PERF_DEFAULT;
|
||||
-}
|
||||
-
|
||||
static int pstate_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
u64 cap1;
|
||||
- u32 highest_perf;
|
||||
|
||||
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
|
||||
&cap1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- /* For platforms that do not support the preferred core feature, the
|
||||
- * highest_pef may be configured with 166 or 255, to avoid max frequency
|
||||
- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as
|
||||
- * the default max perf.
|
||||
- */
|
||||
- if (cpudata->hw_prefcore)
|
||||
- highest_perf = amd_pstate_highest_perf_set(cpudata);
|
||||
- else
|
||||
- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
|
||||
-
|
||||
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
|
||||
+ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
|
||||
+ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1));
|
||||
WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
|
||||
WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
|
||||
WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
|
||||
@@ -446,19 +418,13 @@ static int pstate_init_perf(struct amd_c
|
||||
static int cppc_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
- u32 highest_perf;
|
||||
|
||||
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- if (cpudata->hw_prefcore)
|
||||
- highest_perf = amd_pstate_highest_perf_set(cpudata);
|
||||
- else
|
||||
- highest_perf = cppc_perf.highest_perf;
|
||||
-
|
||||
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
|
||||
+ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
|
||||
+ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf);
|
||||
WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
|
||||
WRITE_ONCE(cpudata->lowest_nonlinear_perf,
|
||||
cppc_perf.lowest_nonlinear_perf);
|
||||
@@ -944,8 +910,8 @@ static u32 amd_pstate_get_transition_lat
|
||||
static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
|
||||
{
|
||||
int ret;
|
||||
- u32 min_freq;
|
||||
- u32 highest_perf, max_freq;
|
||||
+ u32 min_freq, max_freq;
|
||||
+ u64 numerator;
|
||||
u32 nominal_perf, nominal_freq;
|
||||
u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
|
||||
u32 boost_ratio, lowest_nonlinear_ratio;
|
||||
@@ -967,8 +933,10 @@ static int amd_pstate_init_freq(struct a
|
||||
|
||||
nominal_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
|
||||
- highest_perf = READ_ONCE(cpudata->highest_perf);
|
||||
- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
|
||||
+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
|
||||
max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
|
||||
|
||||
lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
|
@@ -0,0 +1,42 @@
|
||||
From deed718125e73b6bf280dcebb80c39108226388c Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:05 -0500
|
||||
Subject: cpufreq: amd-pstate: Optimize amd_pstate_update_limits()
|
||||
|
||||
Don't take and release the mutex when prefcore isn't present and
|
||||
avoid initialization of variables that will be initially set
|
||||
in the function.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 10 +++++-----
|
||||
1 file changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -836,17 +836,17 @@ static void amd_pstate_update_limits(uns
|
||||
|
||||
cpudata = policy->driver_data;
|
||||
|
||||
- mutex_lock(&amd_pstate_driver_lock);
|
||||
- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
|
||||
- goto free_cpufreq_put;
|
||||
+ if (!amd_pstate_prefcore)
|
||||
+ return;
|
||||
|
||||
+ mutex_lock(&amd_pstate_driver_lock);
|
||||
ret = amd_get_highest_perf(cpu, &cur_high);
|
||||
if (ret)
|
||||
goto free_cpufreq_put;
|
||||
|
||||
prev_high = READ_ONCE(cpudata->prefcore_ranking);
|
||||
- if (prev_high != cur_high) {
|
||||
- highest_perf_changed = true;
|
||||
+ highest_perf_changed = (prev_high != cur_high);
|
||||
+ if (highest_perf_changed) {
|
||||
WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
|
||||
|
||||
if (cur_high < CPPC_MAX_PERF)
|
@@ -0,0 +1,29 @@
|
||||
From 391075a34e392c7cacd338a6b034a21a10679855 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:06 -0500
|
||||
Subject: cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore`
|
||||
|
||||
Explain that the sysfs file represents both preferred core being
|
||||
enabled by the user and supported by the hardware.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CP
|
||||
<perf_cap_>`_.)
|
||||
This attribute is read-only.
|
||||
|
||||
+``amd_pstate_hw_prefcore``
|
||||
+
|
||||
+Whether the platform supports the preferred core feature and it has been
|
||||
+enabled. This attribute is read-only.
|
||||
+
|
||||
``energy_performance_available_preferences``
|
||||
|
||||
A list of all the supported EPP preferences that could be used for
|
@@ -0,0 +1,42 @@
|
||||
From 2ed9874f6dcafcc2bee7a922af9e1d1c62dbeb18 Mon Sep 17 00:00:00 2001
|
||||
From: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Date: Thu, 5 Sep 2024 11:30:07 -0500
|
||||
Subject: amd-pstate: Add missing documentation for
|
||||
`amd_pstate_prefcore_ranking`
|
||||
|
||||
`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU
|
||||
core based on platform conditions. Explicitly include it in the
|
||||
documentation.
|
||||
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
|
||||
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
|
||||
---
|
||||
Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++-
|
||||
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
|
||||
@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performa
|
||||
table, so we need to expose it to sysfs. If boost is not active, but
|
||||
still supported, this maximum frequency will be larger than the one in
|
||||
``cpuinfo``. On systems that support preferred core, the driver will have
|
||||
-different values for some cores than others.
|
||||
+different values for some cores than others and this will reflect the values
|
||||
+advertised by the platform at bootup.
|
||||
This attribute is read-only.
|
||||
|
||||
``amd_pstate_lowest_nonlinear_freq``
|
||||
@@ -268,6 +269,12 @@ This attribute is read-only.
|
||||
Whether the platform supports the preferred core feature and it has been
|
||||
enabled. This attribute is read-only.
|
||||
|
||||
+``amd_pstate_prefcore_ranking``
|
||||
+
|
||||
+The performance ranking of the core. This number doesn't have any unit, but
|
||||
+larger numbers are preferred at the time of reading. This can change at
|
||||
+runtime based on platform conditions. This attribute is read-only.
|
||||
+
|
||||
``energy_performance_available_preferences``
|
||||
|
||||
A list of all the supported EPP preferences that could be used for
|
debian/patches/patchset-pf/amd-pstate/0016-cpufreq-amd-pstate-Fix-non-kerneldoc-comment.patch (new file, 24 lines)
@@ -0,0 +1,24 @@
From 2e2ba39aec71fb51e897c3275b255ef806800cf0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:23:51 -0500
Subject: cpufreq/amd-pstate: Fix non kerneldoc comment

The comment for amd_cppc_supported() isn't meant to be kernel doc.

Fixes: cb817ec6673b7 ("cpufreq: amd-pstate: show CPPC debug message if CPPC is not supported")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1786,7 +1786,7 @@ static int __init amd_pstate_set_driver(
return -EINVAL;
}

-/**
+/*
* CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
* show the debug message that helps to check if the CPU has CPPC support for loading issue.
*/
@@ -0,0 +1,24 @@
From 185e64a7e1a749593f3d6dadc666da9dda82d48c Mon Sep 17 00:00:00 2001
From: Qianqiang Liu <qianqiang.liu@163.com>
Date: Wed, 11 Sep 2024 07:39:24 +0800
Subject: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue

Using uninitialized value "mode2" when calling "amd_pstate_get_mode_string".
Set "mode2" to "AMD_PSTATE_DISABLE" by default.

Signed-off-by: Qianqiang Liu <qianqiang.liu@163.com>
---
 drivers/cpufreq/amd-pstate-ut.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -270,7 +270,7 @@ static int amd_pstate_set_mode(enum amd_

static void amd_pstate_ut_check_driver(u32 index)
{
- enum amd_pstate_mode mode1, mode2;
+ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
int ret;

for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
@@ -0,0 +1,108 @@
|
||||
From d74ce254cc470da670d6b90c69bab553cdbde62b Mon Sep 17 00:00:00 2001
|
||||
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
|
||||
Date: Tue, 17 Sep 2024 09:14:35 +0000
|
||||
Subject: cpufreq/amd-pstate: Rename MSR and shared memory specific functions
|
||||
|
||||
Existing function names "cppc_*" and "pstate_*" for shared memory and
|
||||
MSR based systems are not intuitive enough, replace them with "shmem_*" and
|
||||
"msr_*" respectively.
|
||||
|
||||
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
|
||||
---
|
||||
drivers/cpufreq/amd-pstate.c | 24 ++++++++++++------------
|
||||
1 file changed, 12 insertions(+), 12 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -263,7 +263,7 @@ static int amd_pstate_get_energy_pref_in
|
||||
return index;
|
||||
}
|
||||
|
||||
-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
|
||||
+static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
|
||||
u32 des_perf, u32 max_perf, bool fast_switch)
|
||||
{
|
||||
if (fast_switch)
|
||||
@@ -273,7 +273,7 @@ static void pstate_update_perf(struct am
|
||||
READ_ONCE(cpudata->cppc_req_cached));
|
||||
}
|
||||
|
||||
-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf);
|
||||
+DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
|
||||
|
||||
static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
|
||||
u32 min_perf, u32 des_perf,
|
||||
@@ -336,7 +336,7 @@ static int amd_pstate_set_energy_pref_in
|
||||
return ret;
|
||||
}
|
||||
|
||||
-static inline int pstate_enable(bool enable)
|
||||
+static inline int msr_enable(bool enable)
|
||||
{
|
||||
int ret, cpu;
|
||||
unsigned long logical_proc_id_mask = 0;
|
||||
@@ -362,7 +362,7 @@ static inline int pstate_enable(bool ena
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int cppc_enable(bool enable)
|
||||
+static int shmem_enable(bool enable)
|
||||
{
|
||||
int cpu, ret = 0;
|
||||
struct cppc_perf_ctrls perf_ctrls;
|
||||
@@ -389,14 +389,14 @@ static int cppc_enable(bool enable)
|
||||
return ret;
|
||||
}
|
||||
|
||||
-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable);
|
||||
+DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable);
|
||||
|
||||
static inline int amd_pstate_enable(bool enable)
|
||||
{
|
||||
return static_call(amd_pstate_enable)(enable);
|
||||
}
|
||||
|
||||
-static int pstate_init_perf(struct amd_cpudata *cpudata)
|
||||
+static int msr_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
u64 cap1;
|
||||
|
||||
@@ -415,7 +415,7 @@ static int pstate_init_perf(struct amd_c
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int cppc_init_perf(struct amd_cpudata *cpudata)
|
||||
+static int shmem_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
struct cppc_perf_caps cppc_perf;
|
||||
|
||||
@@ -450,14 +450,14 @@ static int cppc_init_perf(struct amd_cpu
|
||||
return ret;
|
||||
}
|
||||
|
||||
-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
|
||||
+DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf);
|
||||
|
||||
static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata)
|
||||
{
|
||||
return static_call(amd_pstate_init_perf)(cpudata);
|
||||
}
|
||||
|
||||
-static void cppc_update_perf(struct amd_cpudata *cpudata,
|
||||
+static void shmem_update_perf(struct amd_cpudata *cpudata,
|
||||
u32 min_perf, u32 des_perf,
|
||||
u32 max_perf, bool fast_switch)
|
||||
{
|
||||
@@ -1909,9 +1909,9 @@ static int __init amd_pstate_init(void)
|
||||
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
|
||||
} else {
|
||||
pr_debug("AMD CPPC shared memory based functionality is supported\n");
|
||||
- static_call_update(amd_pstate_enable, cppc_enable);
|
||||
- static_call_update(amd_pstate_init_perf, cppc_init_perf);
|
||||
- static_call_update(amd_pstate_update_perf, cppc_update_perf);
|
||||
+ static_call_update(amd_pstate_enable, shmem_enable);
|
||||
+ static_call_update(amd_pstate_init_perf, shmem_init_perf);
|
||||
+ static_call_update(amd_pstate_update_perf, shmem_update_perf);
|
||||
}
|
||||
|
||||
if (amd_pstate_prefcore) {
|
@@ -0,0 +1,115 @@
From 787175146e26a199c06be4e6bf8cf8da0f757271 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:52 +0000
Subject: cpufreq: Add a callback to update the min_freq_req from drivers

Currently, there is no proper way to update the initial lower frequency
limit from cpufreq drivers. The only way is to add a new min_freq qos
request from the driver side, but it leads to the issue explained below.

The QoS infrastructure collates the constraints from multiple
subsystems and saves them in a plist. The "current value" is defined to
be the highest value in the plist for min_freq constraint.

The cpufreq core adds a qos_request for min_freq to be 0 and the amd-pstate
driver today adds qos request for min_freq to be lowest_freq, where
lowest_freq corresponds to CPPC.lowest_perf.

Eg: Suppose WLOG considering amd-pstate driver, lowest_freq is 400000 KHz,
lowest_non_linear_freq is 1200000 KHz.

At this point of time, the min_freq QoS plist looks like:

head--> 400000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)

When a user updates /sys/devices/system/cpu/cpuX/cpufreq/scaling_min_freq,
it only results in updating the cpufreq-core's node in the plist, where
say 0 becomes the newly echoed value.

Now, if the user echoes a value 1000000 KHz, to scaling_min_freq, then the
new list would be

head--> 1000000 KHz (registered by cpufreq core) --> 400000 KHz (registered
by amd-pstate)

and the new "current value" of the min_freq QoS constraint will be 1000000
KHz, this is the scenario where it works as expected.

Suppose we change the amd-pstate driver code's min_freq qos constraint
to lowest_non_linear_freq instead of lowest_freq, then the user will
never be able to request a value below that, due to the following:

At boot time, the min_freq QoS plist would be

head--> 1200000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)

When the user echoes a value of 1000000 KHz, to
/sys/devices/..../scaling_min_freq, then the new list would be

head--> 1200000 KHz (registered by amd-pstate) --> 1000000 KHz (registered
by cpufreq core)

with the new "current value" of the min_freq QoS remaining 1200000 KHz.
Since the current value has not changed, there won't be any notifications
sent to the subsystems which have added their QoS constraints. In
particular, the amd-pstate driver will not get the notification, and thus,
the user's request to lower the scaling_min_freq will be ineffective.

Hence, it is advisable to have a single source of truth for the min and
max freq QoS constraints between the cpufreq core and the cpufreq drivers.

So add a new callback get_init_min_freq() in struct cpufreq_driver,
which allows amd-pstate (or any other cpufreq driver) to override the
default min_freq value being set in the policy->min_freq_req. Now
scaling_min_freq can be modified by the user to any value (lower or
higher than the init value) later on if desired.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/cpufreq.c | 6 +++++-
include/linux/cpufreq.h | 6 ++++++
2 files changed, 11 insertions(+), 1 deletion(-)
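
To make the new hook concrete, here is a minimal sketch of a hypothetical driver wiring it
up; the "foo" names are placeholders, and only the .get_init_min_freq member comes from
this patch:

    static int foo_get_init_min_freq(struct cpufreq_policy *policy)
    {
            struct foo_cpudata *data = policy->driver_data;

            /* value the core will use for policy->min_freq_req instead of 0 */
            return data->preferred_min_freq;
    }

    static struct cpufreq_driver foo_cpufreq_driver = {
            .name              = "foo-cpufreq",
            .get_init_min_freq = foo_get_init_min_freq,
            /* other mandatory callbacks omitted */
    };

Because the returned value only seeds the FREQ_QOS_MIN request owned by the core, a later
write to scaling_min_freq updates that same plist node, so the user can still lower the
limit below the driver's initial choice.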
|
||||
|
||||
--- a/drivers/cpufreq/cpufreq.c
|
||||
+++ b/drivers/cpufreq/cpufreq.c
|
||||
@@ -1380,6 +1380,7 @@ static int cpufreq_online(unsigned int c
|
||||
bool new_policy;
|
||||
unsigned long flags;
|
||||
unsigned int j;
|
||||
+ u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
|
||||
int ret;
|
||||
|
||||
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
|
||||
@@ -1464,9 +1465,12 @@ static int cpufreq_online(unsigned int c
|
||||
goto out_destroy_policy;
|
||||
}
|
||||
|
||||
+ if (cpufreq_driver->get_init_min_freq)
|
||||
+ init_min_freq = cpufreq_driver->get_init_min_freq(policy);
|
||||
+
|
||||
ret = freq_qos_add_request(&policy->constraints,
|
||||
policy->min_freq_req, FREQ_QOS_MIN,
|
||||
- FREQ_QOS_MIN_DEFAULT_VALUE);
|
||||
+ init_min_freq);
|
||||
if (ret < 0) {
|
||||
/*
|
||||
* So we don't call freq_qos_remove_request() for an
|
||||
--- a/include/linux/cpufreq.h
|
||||
+++ b/include/linux/cpufreq.h
|
||||
@@ -414,6 +414,12 @@ struct cpufreq_driver {
|
||||
* policy is properly initialized, but before the governor is started.
|
||||
*/
|
||||
void (*register_em)(struct cpufreq_policy *policy);
|
||||
+
|
||||
+ /*
|
||||
+ * Set by drivers that want to initialize the policy->min_freq_req with
|
||||
+ * a value different from the default value (0) in cpufreq core.
|
||||
+ */
|
||||
+ int (*get_init_min_freq)(struct cpufreq_policy *policy);
|
||||
};
|
||||
|
||||
/* flags */
|
@@ -0,0 +1,79 @@
From f5b234be445a45b0bcacc37e0aad7a6bc7900eac Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:54 +0000
Subject: cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq

According to the AMD architectural programmer's manual volume 2 [1], in
section "17.6.4.1 CPPC_CAPABILITY_1" lowest_nonlinear_perf is described
as "Reports the most energy efficient performance level (in terms of
performance per watt). Above this threshold, lower performance levels
generally result in increased energy efficiency. Reducing performance
below this threshold does not result in total energy savings for a given
computation, although it reduces instantaneous power consumption". So
lowest_nonlinear_perf is the most power efficient performance level, and
going below that would lead to a worse performance/watt.

Also, setting the minimum frequency to lowest_nonlinear_freq (instead of
lowest_freq) allows the CPU to idle at a higher frequency which leads
to more time being spent in a deeper idle state (as trivial idle tasks
are completed sooner). This has shown a power benefit in some systems,
in other systems, power consumption has increased but so has the
throughput/watt.

Use the get_init_min_freq() callback to set the initial lower limit for
amd-pstate driver to lowest_nonlinear_freq instead of lowest_freq.

Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf [1]

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1025,13 +1025,6 @@ static int amd_pstate_cpu_init(struct cp
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC))
|
||||
policy->fast_switch_possible = true;
|
||||
|
||||
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
|
||||
- FREQ_QOS_MIN, policy->cpuinfo.min_freq);
|
||||
- if (ret < 0) {
|
||||
- dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
|
||||
- goto free_cpudata1;
|
||||
- }
|
||||
-
|
||||
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
|
||||
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
|
||||
if (ret < 0) {
|
||||
@@ -1736,6 +1729,13 @@ static int amd_pstate_epp_resume(struct
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
|
||||
+{
|
||||
+ struct amd_cpudata *cpudata = policy->driver_data;
|
||||
+
|
||||
+ return READ_ONCE(cpudata->lowest_nonlinear_freq);
|
||||
+}
|
||||
+
|
||||
static struct cpufreq_driver amd_pstate_driver = {
|
||||
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
|
||||
.verify = amd_pstate_verify,
|
||||
@@ -1749,6 +1749,7 @@ static struct cpufreq_driver amd_pstate_
|
||||
.update_limits = amd_pstate_update_limits,
|
||||
.name = "amd-pstate",
|
||||
.attr = amd_pstate_attr,
|
||||
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
|
||||
};
|
||||
|
||||
static struct cpufreq_driver amd_pstate_epp_driver = {
|
||||
@@ -1765,6 +1766,7 @@ static struct cpufreq_driver amd_pstate_
|
||||
.set_boost = amd_pstate_set_boost,
|
||||
.name = "amd-pstate-epp",
|
||||
.attr = amd_pstate_epp_attr,
|
||||
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
|
||||
};
|
||||
|
||||
static int __init amd_pstate_set_driver(int mode_idx)
|
@@ -0,0 +1,103 @@
From f7b2b3a1c0d015c4272793bed89734c5cffb354c Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:56 +0000
Subject: cpufreq/amd-pstate: Cleanup the old min_freq qos request remnants

Convert the freq_qos_request array in struct amd_cpudata to a single
variable (only for max_freq request). Remove the references to cpudata->req
array. Remove and rename the jump labels accordingly.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 19 ++++++++-----------
drivers/cpufreq/amd-pstate.h | 4 ++--
2 files changed, 10 insertions(+), 13 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -726,7 +726,7 @@ static int amd_pstate_cpu_boost_update(s
|
||||
policy->max = policy->cpuinfo.max_freq;
|
||||
|
||||
if (cppc_state == AMD_PSTATE_PASSIVE) {
|
||||
- ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
|
||||
+ ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
|
||||
if (ret < 0)
|
||||
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
|
||||
}
|
||||
@@ -993,17 +993,17 @@ static int amd_pstate_cpu_init(struct cp
|
||||
|
||||
ret = amd_pstate_init_perf(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata1;
|
||||
+ goto free_cpudata;
|
||||
|
||||
amd_pstate_init_prefcore(cpudata);
|
||||
|
||||
ret = amd_pstate_init_freq(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata1;
|
||||
+ goto free_cpudata;
|
||||
|
||||
ret = amd_pstate_init_boost_support(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata1;
|
||||
+ goto free_cpudata;
|
||||
|
||||
min_freq = READ_ONCE(cpudata->min_freq);
|
||||
max_freq = READ_ONCE(cpudata->max_freq);
|
||||
@@ -1025,11 +1025,11 @@ static int amd_pstate_cpu_init(struct cp
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC))
|
||||
policy->fast_switch_possible = true;
|
||||
|
||||
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
|
||||
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
|
||||
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
|
||||
if (ret < 0) {
|
||||
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
|
||||
- goto free_cpudata2;
|
||||
+ goto free_cpudata;
|
||||
}
|
||||
|
||||
cpudata->max_limit_freq = max_freq;
|
||||
@@ -1042,9 +1042,7 @@ static int amd_pstate_cpu_init(struct cp
|
||||
|
||||
return 0;
|
||||
|
||||
-free_cpudata2:
|
||||
- freq_qos_remove_request(&cpudata->req[0]);
|
||||
-free_cpudata1:
|
||||
+free_cpudata:
|
||||
kfree(cpudata);
|
||||
return ret;
|
||||
}
|
||||
@@ -1053,8 +1051,7 @@ static void amd_pstate_cpu_exit(struct c
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
- freq_qos_remove_request(&cpudata->req[1]);
|
||||
- freq_qos_remove_request(&cpudata->req[0]);
|
||||
+ freq_qos_remove_request(&cpudata->max_freq_req);
|
||||
policy->fast_switch_possible = false;
|
||||
kfree(cpudata);
|
||||
}
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
|
||||
/**
|
||||
* struct amd_cpudata - private CPU data for AMD P-State
|
||||
* @cpu: CPU number
|
||||
- * @req: constraint request to apply
|
||||
+ * @max_freq_req: maximum frequency constraint request to apply
|
||||
* @cppc_req_cached: cached performance request hints
|
||||
* @highest_perf: the maximum performance an individual processor may reach,
|
||||
* assuming ideal conditions
|
||||
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
|
||||
struct amd_cpudata {
|
||||
int cpu;
|
||||
|
||||
- struct freq_qos_request req[2];
|
||||
+ struct freq_qos_request max_freq_req;
|
||||
u64 cppc_req_cached;
|
||||
|
||||
u32 highest_perf;
|
@@ -0,0 +1,42 @@
From d1216c052bedbf6d79e4b0261e2f09e17c66ffd3 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 4 Oct 2024 12:23:04 +0000
Subject: cpufreq/amd-pstate: Fix amd_pstate mode switch on shared memory
systems

While switching the driver mode between active and passive, Collaborative
Processor Performance Control (CPPC) is disabled in
amd_pstate_unregister_driver(). But, it is not enabled back while registering
the new driver (passive or active). This leads to the new driver mode not
working correctly, so enable it back in amd_pstate_register_driver().

Fixes: 3ca7bc818d8c ("cpufreq: amd-pstate: Add guided mode control support via sysfs")
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 ++++++++++
1 file changed, 10 insertions(+)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1221,11 +1221,21 @@ static int amd_pstate_register_driver(in
|
||||
return -EINVAL;
|
||||
|
||||
cppc_state = mode;
|
||||
+
|
||||
+ ret = amd_pstate_enable(true);
|
||||
+ if (ret) {
|
||||
+ pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n",
|
||||
+ ret);
|
||||
+ amd_pstate_driver_cleanup();
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
ret = cpufreq_register_driver(current_pstate_driver);
|
||||
if (ret) {
|
||||
amd_pstate_driver_cleanup();
|
||||
return ret;
|
||||
}
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
@@ -0,0 +1,57 @@
From c4fde0d177bdb33912f450914d84d6432391a8b5 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:16 -0500
Subject: cpufreq/amd-pstate: Use nominal perf for limits when boost is
disabled

When boost has been disabled the limit for perf should be nominal perf not
the highest perf. Using the latter to do calculations will lead to
incorrect values that are still above nominal.

Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()")
Reported-by: Peter Jung <ptr1337@cachyos.org>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219348
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 20 ++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
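
A quick worked example with purely illustrative numbers: assume nominal_perf = 166, boost
disabled, and policy->max == policy->cpuinfo.max_freq == 2800000 kHz (the non-boost
ceiling). With the patched code max_perf is nominal_perf, so:

    max_limit_perf = policy->max * max_perf / policy->cpuinfo.max_freq
                   = 2800000 * 166 / 2800000
                   = 166

i.e. limits computed while boost is off are pinned at the nominal ceiling instead of being
derived from highest_perf.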
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -566,11 +566,16 @@ static int amd_pstate_verify(struct cpuf
|
||||
|
||||
static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
|
||||
{
|
||||
- u32 max_limit_perf, min_limit_perf, lowest_perf;
|
||||
+ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf;
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
|
||||
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
|
||||
+ if (cpudata->boost_supported && !policy->boost_enabled)
|
||||
+ max_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
+ else
|
||||
+ max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
+
|
||||
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
|
||||
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
|
||||
|
||||
lowest_perf = READ_ONCE(cpudata->lowest_perf);
|
||||
if (min_limit_perf < lowest_perf)
|
||||
@@ -1526,10 +1531,13 @@ static int amd_pstate_epp_update_limit(s
|
||||
u64 value;
|
||||
s16 epp;
|
||||
|
||||
- max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
+ if (cpudata->boost_supported && !policy->boost_enabled)
|
||||
+ max_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
+ else
|
||||
+ max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
min_perf = READ_ONCE(cpudata->lowest_perf);
|
||||
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
|
||||
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
|
||||
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
|
||||
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
|
||||
|
||||
if (min_limit_perf < min_perf)
|
||||
min_limit_perf = min_perf;
|
@@ -0,0 +1,55 @@
From 01ad0fb3da95867947d923596a26b18d844afe3c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:17 -0500
Subject: cpufreq/amd-pstate: Don't update CPPC request in
amd_pstate_cpu_boost_update()

When boost is changed the CPPC value is changed in amd_pstate_cpu_boost_update()
but then changed again when refresh_frequency_limits() and all its callbacks
occur. The first is a pointless write, so instead just update the limits for
the policy and let the policy refresh anchor everything properly.

Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 24 +-----------------------
1 file changed, 1 insertion(+), 23 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -695,34 +695,12 @@ static void amd_pstate_adjust_perf(unsig
|
||||
static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- struct cppc_perf_ctrls perf_ctrls;
|
||||
- u32 highest_perf, nominal_perf, nominal_freq, max_freq;
|
||||
+ u32 nominal_freq, max_freq;
|
||||
int ret = 0;
|
||||
|
||||
- highest_perf = READ_ONCE(cpudata->highest_perf);
|
||||
- nominal_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
nominal_freq = READ_ONCE(cpudata->nominal_freq);
|
||||
max_freq = READ_ONCE(cpudata->max_freq);
|
||||
|
||||
- if (boot_cpu_has(X86_FEATURE_CPPC)) {
|
||||
- u64 value = READ_ONCE(cpudata->cppc_req_cached);
|
||||
-
|
||||
- value &= ~GENMASK_ULL(7, 0);
|
||||
- value |= on ? highest_perf : nominal_perf;
|
||||
- WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
-
|
||||
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
|
||||
- } else {
|
||||
- perf_ctrls.max_perf = on ? highest_perf : nominal_perf;
|
||||
- ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
|
||||
- if (ret) {
|
||||
- cpufreq_cpu_release(policy);
|
||||
- pr_debug("Failed to set max perf on CPU:%d. ret:%d\n",
|
||||
- cpudata->cpu, ret);
|
||||
- return ret;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
if (on)
|
||||
policy->cpuinfo.max_freq = max_freq;
|
||||
else if (policy->cpuinfo.max_freq > nominal_freq * 1000)
|
@@ -0,0 +1,49 @@
From 684d162c08ab86fff02861c907ecc92bf9c09af4 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:18 -0500
Subject: cpufreq/amd-pstate: Use amd_pstate_update_min_max_limit() for EPP
limits

When the EPP updates are set the maximum capable frequency for the
CPU is used to set the upper limit instead of that of the policy.

Adjust amd_pstate_epp_update_limit() to reuse policy calculation code
from amd_pstate_update_min_max_limit().

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1505,26 +1505,13 @@ static void amd_pstate_epp_cpu_exit(stru
|
||||
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
- u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
|
||||
+ u32 max_perf, min_perf;
|
||||
u64 value;
|
||||
s16 epp;
|
||||
|
||||
- if (cpudata->boost_supported && !policy->boost_enabled)
|
||||
- max_perf = READ_ONCE(cpudata->nominal_perf);
|
||||
- else
|
||||
- max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
+ max_perf = READ_ONCE(cpudata->highest_perf);
|
||||
min_perf = READ_ONCE(cpudata->lowest_perf);
|
||||
- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
|
||||
- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
|
||||
-
|
||||
- if (min_limit_perf < min_perf)
|
||||
- min_limit_perf = min_perf;
|
||||
-
|
||||
- if (max_limit_perf < min_limit_perf)
|
||||
- max_limit_perf = min_limit_perf;
|
||||
-
|
||||
- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
|
||||
- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
|
||||
+ amd_pstate_update_min_max_limit(policy);
|
||||
|
||||
max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
|
||||
cpudata->max_limit_perf);
|
@@ -0,0 +1,29 @@
From fa46d2873c9fa4060ce407e4bc5c7e29babce9d0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:19 -0500
Subject: cpufreq/amd-pstate: Drop needless EPP initialization

The EPP value doesn't need to be cached to the CPPC request in
amd_pstate_epp_update_limit() because it's passed as an argument
at the end to amd_pstate_set_epp() and stored at that time.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 6 ------
1 file changed, 6 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1548,12 +1548,6 @@ static int amd_pstate_epp_update_limit(s
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
epp = 0;
|
||||
|
||||
- /* Set initial EPP value */
|
||||
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
- value &= ~GENMASK_ULL(31, 24);
|
||||
- value |= (u64)epp << 24;
|
||||
- }
|
||||
-
|
||||
WRITE_ONCE(cpudata->cppc_req_cached, value);
|
||||
return amd_pstate_set_epp(cpudata, epp);
|
||||
}
|
@@ -0,0 +1,228 @@
From 649d296be0c7f0df6e71b4fca25fdbe75cb3994e Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Thu, 17 Oct 2024 17:03:11 +0200
Subject: amd-pstate-6.11: update setting the minimum frequency to
lowest_nonlinear_freq patchset to v3

Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++---------------
drivers/cpufreq/amd-pstate.h | 4 +--
drivers/cpufreq/cpufreq.c | 6 +---
include/linux/cpufreq.h | 6 ----
4 files changed, 43 insertions(+), 40 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -557,9 +557,28 @@ cpufreq_policy_put:
|
||||
cpufreq_cpu_put(policy);
|
||||
}
|
||||
|
||||
-static int amd_pstate_verify(struct cpufreq_policy_data *policy)
|
||||
+static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
|
||||
{
|
||||
- cpufreq_verify_within_cpu_limits(policy);
|
||||
+ /*
|
||||
+ * Initialize lower frequency limit (i.e.policy->min) with
|
||||
+ * lowest_nonlinear_frequency which is the most energy efficient
|
||||
+ * frequency. Override the initial value set by cpufreq core and
|
||||
+ * amd-pstate qos_requests.
|
||||
+ */
|
||||
+ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
|
||||
+ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
|
||||
+ struct amd_cpudata *cpudata;
|
||||
+
|
||||
+ if (!policy)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ cpudata = policy->driver_data;
|
||||
+ policy_data->min = cpudata->lowest_nonlinear_freq;
|
||||
+ cpufreq_cpu_put(policy);
|
||||
+ }
|
||||
+
|
||||
+ cpufreq_verify_within_cpu_limits(policy_data);
|
||||
+ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -709,7 +728,7 @@ static int amd_pstate_cpu_boost_update(s
|
||||
policy->max = policy->cpuinfo.max_freq;
|
||||
|
||||
if (cppc_state == AMD_PSTATE_PASSIVE) {
|
||||
- ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
|
||||
+ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
|
||||
if (ret < 0)
|
||||
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
|
||||
}
|
||||
@@ -976,17 +995,17 @@ static int amd_pstate_cpu_init(struct cp
|
||||
|
||||
ret = amd_pstate_init_perf(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata;
|
||||
+ goto free_cpudata1;
|
||||
|
||||
amd_pstate_init_prefcore(cpudata);
|
||||
|
||||
ret = amd_pstate_init_freq(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata;
|
||||
+ goto free_cpudata1;
|
||||
|
||||
ret = amd_pstate_init_boost_support(cpudata);
|
||||
if (ret)
|
||||
- goto free_cpudata;
|
||||
+ goto free_cpudata1;
|
||||
|
||||
min_freq = READ_ONCE(cpudata->min_freq);
|
||||
max_freq = READ_ONCE(cpudata->max_freq);
|
||||
@@ -1008,11 +1027,18 @@ static int amd_pstate_cpu_init(struct cp
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC))
|
||||
policy->fast_switch_possible = true;
|
||||
|
||||
- ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
|
||||
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
|
||||
+ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE);
|
||||
+ if (ret < 0) {
|
||||
+ dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
|
||||
+ goto free_cpudata1;
|
||||
+ }
|
||||
+
|
||||
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
|
||||
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
|
||||
if (ret < 0) {
|
||||
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
|
||||
- goto free_cpudata;
|
||||
+ goto free_cpudata2;
|
||||
}
|
||||
|
||||
cpudata->max_limit_freq = max_freq;
|
||||
@@ -1025,7 +1051,9 @@ static int amd_pstate_cpu_init(struct cp
|
||||
|
||||
return 0;
|
||||
|
||||
-free_cpudata:
|
||||
+free_cpudata2:
|
||||
+ freq_qos_remove_request(&cpudata->req[0]);
|
||||
+free_cpudata1:
|
||||
kfree(cpudata);
|
||||
return ret;
|
||||
}
|
||||
@@ -1034,7 +1062,8 @@ static void amd_pstate_cpu_exit(struct c
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
|
||||
- freq_qos_remove_request(&cpudata->max_freq_req);
|
||||
+ freq_qos_remove_request(&cpudata->req[1]);
|
||||
+ freq_qos_remove_request(&cpudata->req[0]);
|
||||
policy->fast_switch_possible = false;
|
||||
kfree(cpudata);
|
||||
}
|
||||
@@ -1658,13 +1687,6 @@ static int amd_pstate_epp_cpu_offline(st
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
|
||||
-{
|
||||
- cpufreq_verify_within_cpu_limits(policy);
|
||||
- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
|
||||
{
|
||||
struct amd_cpudata *cpudata = policy->driver_data;
|
||||
@@ -1703,13 +1725,6 @@ static int amd_pstate_epp_resume(struct
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
|
||||
-{
|
||||
- struct amd_cpudata *cpudata = policy->driver_data;
|
||||
-
|
||||
- return READ_ONCE(cpudata->lowest_nonlinear_freq);
|
||||
-}
|
||||
-
|
||||
static struct cpufreq_driver amd_pstate_driver = {
|
||||
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
|
||||
.verify = amd_pstate_verify,
|
||||
@@ -1723,12 +1738,11 @@ static struct cpufreq_driver amd_pstate_
|
||||
.update_limits = amd_pstate_update_limits,
|
||||
.name = "amd-pstate",
|
||||
.attr = amd_pstate_attr,
|
||||
- .get_init_min_freq = amd_pstate_get_init_min_freq,
|
||||
};
|
||||
|
||||
static struct cpufreq_driver amd_pstate_epp_driver = {
|
||||
.flags = CPUFREQ_CONST_LOOPS,
|
||||
- .verify = amd_pstate_epp_verify_policy,
|
||||
+ .verify = amd_pstate_verify,
|
||||
.setpolicy = amd_pstate_epp_set_policy,
|
||||
.init = amd_pstate_epp_cpu_init,
|
||||
.exit = amd_pstate_epp_cpu_exit,
|
||||
@@ -1740,7 +1754,6 @@ static struct cpufreq_driver amd_pstate_
|
||||
.set_boost = amd_pstate_set_boost,
|
||||
.name = "amd-pstate-epp",
|
||||
.attr = amd_pstate_epp_attr,
|
||||
- .get_init_min_freq = amd_pstate_get_init_min_freq,
|
||||
};
|
||||
|
||||
static int __init amd_pstate_set_driver(int mode_idx)
|
||||
--- a/drivers/cpufreq/amd-pstate.h
|
||||
+++ b/drivers/cpufreq/amd-pstate.h
|
||||
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
|
||||
/**
|
||||
* struct amd_cpudata - private CPU data for AMD P-State
|
||||
* @cpu: CPU number
|
||||
- * @max_freq_req: maximum frequency constraint request to apply
|
||||
+ * @req: constraint request to apply
|
||||
* @cppc_req_cached: cached performance request hints
|
||||
* @highest_perf: the maximum performance an individual processor may reach,
|
||||
* assuming ideal conditions
|
||||
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
|
||||
struct amd_cpudata {
|
||||
int cpu;
|
||||
|
||||
- struct freq_qos_request max_freq_req;
|
||||
+ struct freq_qos_request req[2];
|
||||
u64 cppc_req_cached;
|
||||
|
||||
u32 highest_perf;
|
||||
--- a/drivers/cpufreq/cpufreq.c
|
||||
+++ b/drivers/cpufreq/cpufreq.c
|
||||
@@ -1380,7 +1380,6 @@ static int cpufreq_online(unsigned int c
|
||||
bool new_policy;
|
||||
unsigned long flags;
|
||||
unsigned int j;
|
||||
- u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
|
||||
int ret;
|
||||
|
||||
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
|
||||
@@ -1465,12 +1464,9 @@ static int cpufreq_online(unsigned int c
|
||||
goto out_destroy_policy;
|
||||
}
|
||||
|
||||
- if (cpufreq_driver->get_init_min_freq)
|
||||
- init_min_freq = cpufreq_driver->get_init_min_freq(policy);
|
||||
-
|
||||
ret = freq_qos_add_request(&policy->constraints,
|
||||
policy->min_freq_req, FREQ_QOS_MIN,
|
||||
- init_min_freq);
|
||||
+ FREQ_QOS_MIN_DEFAULT_VALUE);
|
||||
if (ret < 0) {
|
||||
/*
|
||||
* So we don't call freq_qos_remove_request() for an
|
||||
--- a/include/linux/cpufreq.h
|
||||
+++ b/include/linux/cpufreq.h
|
||||
@@ -414,12 +414,6 @@ struct cpufreq_driver {
|
||||
* policy is properly initialized, but before the governor is started.
|
||||
*/
|
||||
void (*register_em)(struct cpufreq_policy *policy);
|
||||
-
|
||||
- /*
|
||||
- * Set by drivers that want to initialize the policy->min_freq_req with
|
||||
- * a value different from the default value (0) in cpufreq core.
|
||||
- */
|
||||
- int (*get_init_min_freq)(struct cpufreq_policy *policy);
|
||||
};
|
||||
|
||||
/* flags */
|
@@ -0,0 +1,44 @@
From db147a0a6341822a15fd9c4cd51f8dc4a9a1747b Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:27 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_register() in amd_pstate_init()

Replace a similar chunk of code in amd_pstate_init() with
amd_pstate_register() call.

Suggested-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1909,17 +1909,10 @@ static int __init amd_pstate_init(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
- /* enable amd pstate feature */
|
||||
- ret = amd_pstate_enable(true);
|
||||
- if (ret) {
|
||||
- pr_err("failed to enable driver mode(%d)\n", cppc_state);
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
- ret = cpufreq_register_driver(current_pstate_driver);
|
||||
+ ret = amd_pstate_register_driver(cppc_state);
|
||||
if (ret) {
|
||||
pr_err("failed to register with return %d\n", ret);
|
||||
- goto disable_driver;
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
dev_root = bus_get_dev_root(&cpu_subsys);
|
||||
@@ -1936,7 +1929,6 @@ static int __init amd_pstate_init(void)
|
||||
|
||||
global_attr_free:
|
||||
cpufreq_unregister_driver(current_pstate_driver);
|
||||
-disable_driver:
|
||||
amd_pstate_enable(false);
|
||||
return ret;
|
||||
}
|
@@ -0,0 +1,81 @@
From 7c658490b05f6ab4dd59e1c25e75ba1037f6cfeb Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:29 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_set_driver() in
amd_pstate_register_driver()

Replace a similar chunk of code in amd_pstate_register_driver() with
amd_pstate_set_driver() call.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 47 +++++++++++++++++-------------------
1 file changed, 22 insertions(+), 25 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1221,16 +1221,32 @@ static void amd_pstate_driver_cleanup(vo
|
||||
current_pstate_driver = NULL;
|
||||
}
|
||||
|
||||
+static int amd_pstate_set_driver(int mode_idx)
|
||||
+{
|
||||
+ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
|
||||
+ cppc_state = mode_idx;
|
||||
+ if (cppc_state == AMD_PSTATE_DISABLE)
|
||||
+ pr_info("driver is explicitly disabled\n");
|
||||
+
|
||||
+ if (cppc_state == AMD_PSTATE_ACTIVE)
|
||||
+ current_pstate_driver = &amd_pstate_epp_driver;
|
||||
+
|
||||
+ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
|
||||
+ current_pstate_driver = &amd_pstate_driver;
|
||||
+
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ return -EINVAL;
|
||||
+}
|
||||
+
|
||||
static int amd_pstate_register_driver(int mode)
|
||||
{
|
||||
int ret;
|
||||
|
||||
- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED)
|
||||
- current_pstate_driver = &amd_pstate_driver;
|
||||
- else if (mode == AMD_PSTATE_ACTIVE)
|
||||
- current_pstate_driver = &amd_pstate_epp_driver;
|
||||
- else
|
||||
- return -EINVAL;
|
||||
+ ret = amd_pstate_set_driver(mode);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
|
||||
cppc_state = mode;
|
||||
|
||||
@@ -1756,25 +1772,6 @@ static struct cpufreq_driver amd_pstate_
|
||||
.attr = amd_pstate_epp_attr,
|
||||
};
|
||||
|
||||
-static int __init amd_pstate_set_driver(int mode_idx)
|
||||
-{
|
||||
- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
|
||||
- cppc_state = mode_idx;
|
||||
- if (cppc_state == AMD_PSTATE_DISABLE)
|
||||
- pr_info("driver is explicitly disabled\n");
|
||||
-
|
||||
- if (cppc_state == AMD_PSTATE_ACTIVE)
|
||||
- current_pstate_driver = &amd_pstate_epp_driver;
|
||||
-
|
||||
- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
|
||||
- current_pstate_driver = &amd_pstate_driver;
|
||||
-
|
||||
- return 0;
|
||||
- }
|
||||
-
|
||||
- return -EINVAL;
|
||||
-}
|
||||
-
|
||||
/*
|
||||
* CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
|
||||
* show the debug message that helps to check if the CPU has CPPC support for loading issue.
|
@@ -0,0 +1,41 @@
From 55be5db97f4f52badc958463ee8d9cbc2ae91615 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:31 +0000
Subject: cpufreq/amd-pstate: Remove the switch case in amd_pstate_init()

Replace the switch case with a more readable if condition.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1873,21 +1873,15 @@ static int __init amd_pstate_init(void)
|
||||
cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE;
|
||||
}
|
||||
|
||||
- switch (cppc_state) {
|
||||
- case AMD_PSTATE_DISABLE:
|
||||
+ if (cppc_state == AMD_PSTATE_DISABLE) {
|
||||
pr_info("driver load is disabled, boot with specific mode to enable this\n");
|
||||
return -ENODEV;
|
||||
- case AMD_PSTATE_PASSIVE:
|
||||
- case AMD_PSTATE_ACTIVE:
|
||||
- case AMD_PSTATE_GUIDED:
|
||||
- ret = amd_pstate_set_driver(cppc_state);
|
||||
- if (ret)
|
||||
- return ret;
|
||||
- break;
|
||||
- default:
|
||||
- return -EINVAL;
|
||||
}
|
||||
|
||||
+ ret = amd_pstate_set_driver(cppc_state);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
/* capability check */
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
pr_debug("AMD CPPC MSR based functionality is supported\n");
|
@@ -0,0 +1,43 @@
From 7305364888151cb9e6b435c5f219ccfd18132b58 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:33 +0000
Subject: cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call

amd_pstate_set_driver() is called twice, once in amd_pstate_init() and once
as part of amd_pstate_register_driver(). Move around code and eliminate
the redundancy.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1878,9 +1878,11 @@ static int __init amd_pstate_init(void)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
- ret = amd_pstate_set_driver(cppc_state);
|
||||
- if (ret)
|
||||
+ ret = amd_pstate_register_driver(cppc_state);
|
||||
+ if (ret) {
|
||||
+ pr_err("failed to register with return %d\n", ret);
|
||||
return ret;
|
||||
+ }
|
||||
|
||||
/* capability check */
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
@@ -1900,12 +1902,6 @@ static int __init amd_pstate_init(void)
|
||||
return ret;
|
||||
}
|
||||
|
||||
- ret = amd_pstate_register_driver(cppc_state);
|
||||
- if (ret) {
|
||||
- pr_err("failed to register with return %d\n", ret);
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
dev_root = bus_get_dev_root(&cpu_subsys);
|
||||
if (dev_root) {
|
||||
ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group);
|
@@ -0,0 +1,33 @@
From 5886ef269d069c72ea952cb00699e16221289e8c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 17 Oct 2024 12:34:39 -0500
Subject: cpufreq/amd-pstate-ut: Add fix for min freq unit test

commit 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to
lowest_nonlinear_freq") changed the initial minimum frequency to lowest
nonlinear frequency, but the unit tests weren't updated and now fail.

Update them to match this same change.

Fixes: 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate-ut.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate-ut.c
|
||||
+++ b/drivers/cpufreq/amd-pstate-ut.c
|
||||
@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32
|
||||
goto skip_test;
|
||||
}
|
||||
|
||||
- if (cpudata->min_freq != policy->min) {
|
||||
+ if (cpudata->lowest_nonlinear_freq != policy->min) {
|
||||
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
|
||||
- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n",
|
||||
- __func__, cpu, cpudata->min_freq, policy->min);
|
||||
+ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
|
||||
+ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
|
||||
goto skip_test;
|
||||
}
|
||||
|
@@ -0,0 +1,33 @@
From e82b9b5a56bcac18cae68878fe67263279805735 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Date: Mon, 21 Oct 2024 15:48:35 +0530
Subject: amd-pstate: Set min_perf to nominal_perf for active mode performance
gov

The amd-pstate driver sets CPPC_REQ.min_perf to CPPC_REQ.max_perf when
in active mode with performance governor. Typically CPPC_REQ.max_perf
is set to CPPC.highest_perf. This causes frequency throttling on
power-limited platforms which causes performance regressions on
certain classes of workloads.

Hence, set the CPPC_REQ.min_perf to the CPPC.nominal_perf or
CPPC_REQ.max_perf, whichever is lower of the two.

Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
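
As a quick illustration with made-up perf values: if CPPC.highest_perf = 196 and
CPPC.nominal_perf = 120, the performance governor previously requested
min_perf = max_perf = 196, pinning the request at the boost ceiling even when the platform
power limit cannot sustain it. After this change:

    min_perf = min(cpudata->nominal_perf, max_perf);   /* min(120, 196) = 120 */

so the hardware is free to settle anywhere between nominal and highest perf instead of
being forced to the top.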
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1565,7 +1565,7 @@ static int amd_pstate_epp_update_limit(s
|
||||
value = READ_ONCE(cpudata->cppc_req_cached);
|
||||
|
||||
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
|
||||
- min_perf = max_perf;
|
||||
+ min_perf = min(cpudata->nominal_perf, max_perf);
|
||||
|
||||
/* Initial min/max values for CPPC Performance Controls Register */
|
||||
value &= ~AMD_CPPC_MIN_PERF(~0L);
|
@@ -0,0 +1,44 @@
From 497447cf96a785a4edd0756da5d5718037f5687c Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 21 Oct 2024 15:48:36 +0530
Subject: amd-pstate: Switch to amd-pstate by default on some Server platforms

Currently the default cpufreq driver for all the AMD EPYC servers is
acpi-cpufreq. Going forward, switch to amd-pstate as the default
driver on the AMD EPYC server platforms with CPU family 0x1A or
higher. The default mode will be active mode.

Testing shows that amd-pstate with active mode and performance
governor provides comparable or better performance per-watt against
acpi-cpufreq + performance governor.

Likewise, amd-pstate with active mode and powersave governor with the
energy_performance_preference=power (EPP=255) provides comparable or
better performance per-watt against acpi-cpufreq + schedutil governor
for a wide range of workloads.

Users can still revert to using acpi-cpufreq driver on these platforms
with the "amd_pstate=disable" kernel commandline parameter.

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1862,10 +1862,10 @@ static int __init amd_pstate_init(void)
|
||||
if (cppc_state == AMD_PSTATE_UNDEFINED) {
|
||||
/* Disable on the following configs by default:
|
||||
* 1. Undefined platforms
|
||||
- * 2. Server platforms
|
||||
+ * 2. Server platforms with CPUs older than Family 0x1A.
|
||||
*/
|
||||
if (amd_pstate_acpi_pm_profile_undefined() ||
|
||||
- amd_pstate_acpi_pm_profile_server()) {
|
||||
+ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) {
|
||||
pr_info("driver load is disabled, boot with specific mode to enable this\n");
|
||||
return -ENODEV;
|
||||
}
|
@@ -0,0 +1,38 @@
From a4d255935a1ea6e4b10167df942ec641079bcdf7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:41 -0500
Subject: cpufreq/amd-pstate: Push adjust_perf vfunc init into cpu_init

As the driver can be changed in and out of different modes it's possible
that adjust_perf is assigned when it shouldn't be.

This could happen if an MSR design is started up in passive mode and then
switches to active mode.

To solve this explicitly clear `adjust_perf` in amd_pstate_epp_cpu_init().

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1528,6 +1528,8 @@ static int amd_pstate_epp_cpu_init(struc
|
||||
WRITE_ONCE(cpudata->cppc_cap1_cached, value);
|
||||
}
|
||||
|
||||
+ current_pstate_driver->adjust_perf = NULL;
|
||||
+
|
||||
return 0;
|
||||
|
||||
free_cpudata1:
|
||||
@@ -1887,8 +1889,6 @@ static int __init amd_pstate_init(void)
|
||||
/* capability check */
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
pr_debug("AMD CPPC MSR based functionality is supported\n");
|
||||
- if (cppc_state != AMD_PSTATE_ACTIVE)
|
||||
- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
|
||||
} else {
|
||||
pr_debug("AMD CPPC shared memory based functionality is supported\n");
|
||||
static_call_update(amd_pstate_enable, shmem_enable);
|
@@ -0,0 +1,47 @@
From c42a82a583646dcbba8500d47ed878616ab5c33a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:42 -0500
Subject: cpufreq/amd-pstate: Move registration after static function call
update

On shared memory designs the static functions need to work before
registration is done or the system can hang at bootup.

Move the registration later in amd_pstate_init() to solve this.

Fixes: e238968a2087 ("cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call")
Reported-by: Klara Modin <klarasmodin@gmail.com>
Closes: https://lore.kernel.org/linux-pm/cf9c146d-bacf-444e-92e2-15ebf513af96@gmail.com/#t
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/amd-pstate.c
|
||||
+++ b/drivers/cpufreq/amd-pstate.c
|
||||
@@ -1880,12 +1880,6 @@ static int __init amd_pstate_init(void)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
- ret = amd_pstate_register_driver(cppc_state);
|
||||
- if (ret) {
|
||||
- pr_err("failed to register with return %d\n", ret);
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
/* capability check */
|
||||
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
|
||||
pr_debug("AMD CPPC MSR based functionality is supported\n");
|
||||
@@ -1896,6 +1890,12 @@ static int __init amd_pstate_init(void)
|
||||
static_call_update(amd_pstate_update_perf, shmem_update_perf);
|
||||
}
|
||||
|
||||
+ ret = amd_pstate_register_driver(cppc_state);
|
||||
+ if (ret) {
|
||||
+ pr_err("failed to register with return %d\n", ret);
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
if (amd_pstate_prefcore) {
|
||||
ret = amd_detect_prefcore(&amd_pstate_prefcore);
|
||||
if (ret)
|
321 debian/patches/patchset-pf/amd-rapl/0001-perf-Generic-hotplug-support-for-a-PMU-with-a-scope.patch vendored Normal file
@@ -0,0 +1,321 @@
From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:37 -0700
Subject: perf: Generic hotplug support for a PMU with a scope

The perf subsystem assumes that the counters of a PMU are per-CPU. So
the user space tool reads a counter from each CPU in the system wide
mode. However, many PMUs don't have a per-CPU counter. The counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict to one CPU to stand
for a specific scope. In case the given CPU is removed,
the hotplug support has to be implemented for each such driver.

The codes to support the cpumask and hotplug are very similar.
- Expose a cpumask into sysfs
- Pickup another CPU in the same scope if the given CPU is removed.
- Invoke the perf_pmu_migrate_context() to migrate to a new CPU.
- In event init, always set the CPU in the cpumask to event->cpu

Similar duplicated codes are implemented for each such PMU driver. It
would be good to introduce a generic infrastructure to avoid such
duplication.

5 popular scopes are implemented here, core, die, cluster, pkg, and
the system-wide. The scope can be set when a PMU is registered. If so, a
"cpumask" is automatically exposed for the PMU.

The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 18 ++++
kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
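
From a driver's point of view the new infrastructure reduces to setting one field at
registration time. A minimal hypothetical example (the "foo" names and event callbacks are
placeholders; only .scope and the PERF_PMU_SCOPE_* values come from this patch):

    static struct pmu foo_pmu = {
            .task_ctx_nr = perf_invalid_context,
            .scope       = PERF_PMU_SCOPE_PKG,      /* one counter per package */
            .event_init  = foo_event_init,
            .add         = foo_event_add,
            .del         = foo_event_del,
            .start       = foo_event_start,
            .stop        = foo_event_stop,
            .read        = foo_event_read,
    };

    /* the core then exposes "cpumask" in sysfs and migrates events on hotplug */
    ret = perf_pmu_register(&foo_pmu, "foo_pkg_pmu", -1);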
|
||||
|
||||
--- a/include/linux/perf_event.h
|
||||
+++ b/include/linux/perf_event.h
|
||||
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
|
||||
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
|
||||
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
|
||||
|
||||
+/**
|
||||
+ * pmu::scope
|
||||
+ */
|
||||
+enum perf_pmu_scope {
|
||||
+ PERF_PMU_SCOPE_NONE = 0,
|
||||
+ PERF_PMU_SCOPE_CORE,
|
||||
+ PERF_PMU_SCOPE_DIE,
|
||||
+ PERF_PMU_SCOPE_CLUSTER,
|
||||
+ PERF_PMU_SCOPE_PKG,
|
||||
+ PERF_PMU_SCOPE_SYS_WIDE,
|
||||
+ PERF_PMU_MAX_SCOPE,
|
||||
+};
|
||||
+
|
||||
struct perf_output_handle;
|
||||
|
||||
#define PMU_NULL_DEV ((void *)(~0UL))
|
||||
@@ -315,6 +328,11 @@ struct pmu {
|
||||
*/
|
||||
int capabilities;
|
||||
|
||||
+ /*
|
||||
+ * PMU scope
|
||||
+ */
|
||||
+ unsigned int scope;
|
||||
+
|
||||
int __percpu *pmu_disable_count;
|
||||
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
|
||||
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
|
||||
--- a/kernel/events/core.c
|
||||
+++ b/kernel/events/core.c
|
||||
@@ -411,6 +411,11 @@ static LIST_HEAD(pmus);
|
||||
static DEFINE_MUTEX(pmus_lock);
|
||||
static struct srcu_struct pmus_srcu;
|
||||
static cpumask_var_t perf_online_mask;
|
||||
+static cpumask_var_t perf_online_core_mask;
|
||||
+static cpumask_var_t perf_online_die_mask;
|
||||
+static cpumask_var_t perf_online_cluster_mask;
|
||||
+static cpumask_var_t perf_online_pkg_mask;
|
||||
+static cpumask_var_t perf_online_sys_mask;
|
||||
static struct kmem_cache *perf_event_cache;
|
||||
|
||||
/*
|
||||
@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct
|
||||
}
|
||||
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
|
||||
|
||||
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
|
||||
+{
|
||||
+ switch (scope) {
|
||||
+ case PERF_PMU_SCOPE_CORE:
|
||||
+ return topology_sibling_cpumask(cpu);
|
||||
+ case PERF_PMU_SCOPE_DIE:
|
||||
+ return topology_die_cpumask(cpu);
|
||||
+ case PERF_PMU_SCOPE_CLUSTER:
|
||||
+ return topology_cluster_cpumask(cpu);
|
||||
+ case PERF_PMU_SCOPE_PKG:
|
||||
+ return topology_core_cpumask(cpu);
|
||||
+ case PERF_PMU_SCOPE_SYS_WIDE:
|
||||
+ return cpu_online_mask;
|
||||
+ }
|
||||
+
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
|
||||
+{
|
||||
+ switch (scope) {
|
||||
+ case PERF_PMU_SCOPE_CORE:
|
||||
+ return perf_online_core_mask;
|
||||
+ case PERF_PMU_SCOPE_DIE:
|
||||
+ return perf_online_die_mask;
|
||||
+ case PERF_PMU_SCOPE_CLUSTER:
|
||||
+ return perf_online_cluster_mask;
|
||||
+ case PERF_PMU_SCOPE_PKG:
|
||||
+ return perf_online_pkg_mask;
|
||||
+ case PERF_PMU_SCOPE_SYS_WIDE:
|
||||
+ return perf_online_sys_mask;
|
||||
+ }
|
||||
+
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
|
||||
+ char *buf)
|
||||
+{
|
||||
+ struct pmu *pmu = dev_get_drvdata(dev);
|
||||
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
|
||||
+
|
||||
+ if (mask)
|
||||
+ return cpumap_print_to_pagebuf(true, buf, mask);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static DEVICE_ATTR_RO(cpumask);
|
||||
+
|
||||
static struct attribute *pmu_dev_attrs[] = {
|
||||
&dev_attr_type.attr,
|
||||
&dev_attr_perf_event_mux_interval_ms.attr,
|
||||
&dev_attr_nr_addr_filters.attr,
|
||||
+ &dev_attr_cpumask.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct
|
||||
if (n == 2 && !pmu->nr_addr_filters)
|
||||
return 0;
|
||||
|
||||
+ /* cpumask */
|
||||
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
|
||||
+ return 0;
|
||||
+
|
||||
return a->mode;
|
||||
}
|
||||
|
||||
@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, c
|
||||
goto free_pdc;
|
||||
}
|
||||
|
||||
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
|
||||
+ ret = -EINVAL;
|
||||
+ goto free_pdc;
|
||||
+ }
|
||||
+
|
||||
pmu->name = name;
|
||||
|
||||
if (type >= 0)
|
||||
@@ -11750,6 +11814,22 @@ static int perf_try_init_event(struct pm
|
||||
event_has_any_exclude_flag(event))
|
||||
ret = -EINVAL;
|
||||
|
||||
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
|
||||
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
|
||||
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
|
||||
+ int cpu;
|
||||
+
|
||||
+ if (pmu_cpumask && cpumask) {
|
||||
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
|
||||
+ if (cpu >= nr_cpu_ids)
|
||||
+ ret = -ENODEV;
|
||||
+ else
|
||||
+ event->cpu = cpu;
|
||||
+ } else {
|
||||
+ ret = -ENODEV;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
if (ret && event->destroy)
|
||||
event->destroy(event);
|
||||
}
|
||||
@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_c
|
||||
int cpu;
|
||||
|
||||
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
|
||||
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
|
||||
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
|
||||
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
|
||||
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
|
||||
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
|
||||
+
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
swhash = &per_cpu(swevent_htable, cpu);
|
||||
@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(vo
|
||||
raw_spin_unlock(&ctx->lock);
|
||||
}
|
||||
|
||||
+static void perf_event_clear_cpumask(unsigned int cpu)
|
||||
+{
|
||||
+ int target[PERF_PMU_MAX_SCOPE];
|
||||
+ unsigned int scope;
|
||||
+ struct pmu *pmu;
|
||||
+
|
||||
+ cpumask_clear_cpu(cpu, perf_online_mask);
|
||||
+
|
||||
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
|
||||
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
|
||||
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
|
||||
+
|
||||
+ target[scope] = -1;
|
||||
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
|
||||
+ continue;
|
||||
+
|
||||
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
|
||||
+ continue;
|
||||
+ target[scope] = cpumask_any_but(cpumask, cpu);
|
||||
+ if (target[scope] < nr_cpu_ids)
|
||||
+ cpumask_set_cpu(target[scope], pmu_cpumask);
|
||||
+ }
|
||||
+
|
||||
+ /* migrate */
|
||||
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
|
||||
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
|
||||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
|
||||
+ continue;
|
||||
+
|
||||
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
|
||||
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static void perf_event_exit_cpu_context(int cpu)
|
||||
{
|
||||
struct perf_cpu_context *cpuctx;
|
||||
@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context(
|
||||
|
||||
// XXX simplify cpuctx->online
|
||||
mutex_lock(&pmus_lock);
|
||||
+ /*
|
||||
+ * Clear the cpumasks, and migrate to other CPUs if possible.
|
||||
+ * Must be invoked before the __perf_event_exit_context.
|
||||
+ */
|
||||
+ perf_event_clear_cpumask(cpu);
|
||||
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
|
||||
ctx = &cpuctx->ctx;
|
||||
|
||||
@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context(
|
||||
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
|
||||
cpuctx->online = 0;
|
||||
mutex_unlock(&ctx->mutex);
|
||||
- cpumask_clear_cpu(cpu, perf_online_mask);
|
||||
mutex_unlock(&pmus_lock);
|
||||
}
|
||||
#else
|
||||
@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context(
|
||||
|
||||
#endif
|
||||
|
||||
+static void perf_event_setup_cpumask(unsigned int cpu)
|
||||
+{
|
||||
+ struct cpumask *pmu_cpumask;
|
||||
+ unsigned int scope;
|
||||
+
|
||||
+ cpumask_set_cpu(cpu, perf_online_mask);
|
||||
+
|
||||
+ /*
|
||||
+ * Early boot stage, the cpumask hasn't been set yet.
|
||||
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
|
||||
+ * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
|
||||
+ */
|
||||
+ if (!topology_sibling_cpumask(cpu)) {
|
||||
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
|
||||
+ pmu_cpumask = perf_scope_cpumask(scope);
|
||||
+ if (WARN_ON_ONCE(!pmu_cpumask))
|
||||
+ continue;
|
||||
+ cpumask_set_cpu(cpu, pmu_cpumask);
|
||||
+ }
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
|
||||
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
|
||||
+
|
||||
+ pmu_cpumask = perf_scope_cpumask(scope);
|
||||
+
|
||||
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
|
||||
+ continue;
|
||||
+
|
||||
+ if (!cpumask_empty(cpumask) &&
|
||||
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
|
||||
+ cpumask_set_cpu(cpu, pmu_cpumask);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
int perf_event_init_cpu(unsigned int cpu)
|
||||
{
|
||||
struct perf_cpu_context *cpuctx;
|
||||
@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu
|
||||
perf_swevent_init_cpu(cpu);
|
||||
|
||||
mutex_lock(&pmus_lock);
|
||||
- cpumask_set_cpu(cpu, perf_online_mask);
|
||||
+ perf_event_setup_cpumask(cpu);
|
||||
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
|
||||
ctx = &cpuctx->ctx;
|
||||
|
debian/patches/patchset-pf/amd-rapl/0002-perf-Add-PERF_EV_CAP_READ_SCOPE.patch (new file, vendored, 71 lines)
@@ -0,0 +1,71 @@
From 8c7eb17e722a6a45c4436e5debb9336089b21d9b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:38 -0700
Subject: perf: Add PERF_EV_CAP_READ_SCOPE

Usually, an event can be read from any CPU of the scope. It doesn't need
to be read from the advertised CPU.

Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with
scope can be read from any active CPU in the scope.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |  3 +++
 kernel/events/core.c       | 14 +++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

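For orientation, an illustrative sketch (not part of the patch) of what the new capability means on the read path. perf_scope_cpu_topology_cpumask() is the helper added by the previous patch; the function name can_read_locally is made up for the example.

/*
 * With PERF_EV_CAP_READ_SCOPE set, a read issued on any CPU that shares
 * the event's PMU scope with event->cpu can be satisfied locally instead
 * of sending an IPI to event->cpu.
 */
static bool can_read_locally(struct perf_event *event, int local_cpu)
{
	const struct cpumask *mask =
		perf_scope_cpu_topology_cpumask(event->pmu->scope, event->cpu);

	return (event->group_caps & PERF_EV_CAP_READ_SCOPE) &&
	       mask && cpumask_test_cpu(local_cpu, mask);
}
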
--- a/include/linux/perf_event.h
|
||||
+++ b/include/linux/perf_event.h
|
||||
@@ -633,10 +633,13 @@ typedef void (*perf_overflow_handler_t)(
|
||||
* PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
|
||||
* cannot be a group leader. If an event with this flag is detached from the
|
||||
* group it is scheduled out and moved into an unrecoverable ERROR state.
|
||||
+ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
|
||||
+ * PMU scope where it is active.
|
||||
*/
|
||||
#define PERF_EV_CAP_SOFTWARE BIT(0)
|
||||
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
|
||||
#define PERF_EV_CAP_SIBLING BIT(2)
|
||||
+#define PERF_EV_CAP_READ_SCOPE BIT(3)
|
||||
|
||||
#define SWEVENT_HLIST_BITS 8
|
||||
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
|
||||
--- a/kernel/events/core.c
|
||||
+++ b/kernel/events/core.c
|
||||
@@ -4477,16 +4477,24 @@ struct perf_read_data {
|
||||
int ret;
|
||||
};
|
||||
|
||||
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
|
||||
+
|
||||
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
|
||||
{
|
||||
+ int local_cpu = smp_processor_id();
|
||||
u16 local_pkg, event_pkg;
|
||||
|
||||
if ((unsigned)event_cpu >= nr_cpu_ids)
|
||||
return event_cpu;
|
||||
|
||||
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
|
||||
- int local_cpu = smp_processor_id();
|
||||
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
|
||||
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
|
||||
+
|
||||
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
|
||||
+ return local_cpu;
|
||||
+ }
|
||||
|
||||
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
|
||||
event_pkg = topology_physical_package_id(event_cpu);
|
||||
local_pkg = topology_physical_package_id(local_cpu);
|
||||
|
||||
@@ -11824,7 +11832,7 @@ static int perf_try_init_event(struct pm
|
||||
if (cpu >= nr_cpu_ids)
|
||||
ret = -ENODEV;
|
||||
else
|
||||
- event->cpu = cpu;
|
||||
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
|
||||
} else {
|
||||
ret = -ENODEV;
|
||||
}
|
debian/patches/patchset-pf/amd-rapl/0003-perf-x86-intel-cstate-Clean-up-cpumask-and-hotplug.patch (new file, vendored, 286 lines)
@@ -0,0 +1,286 @@
From 09c1529eb102b486220c35546f2663ca858a2943 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:39 -0700
Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug

There are three cstate PMUs with different scopes, core, die and module.
The scopes are supported by the generic perf_event subsystem now.

Set the scope for each PMU and remove all the cpumask and hotplug codes.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/intel/cstate.c | 142 ++-------------------------------
 include/linux/cpuhotplug.h     |   2 -
 2 files changed, 5 insertions(+), 139 deletions(-)

--- a/arch/x86/events/intel/cstate.c
|
||||
+++ b/arch/x86/events/intel/cstate.c
|
||||
@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(st
|
||||
static struct device_attribute format_attr_##_var = \
|
||||
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
|
||||
|
||||
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
|
||||
- struct device_attribute *attr,
|
||||
- char *buf);
|
||||
-
|
||||
/* Model -> events mapping */
|
||||
struct cstate_model {
|
||||
unsigned long core_events;
|
||||
@@ -206,22 +202,9 @@ static struct attribute_group cstate_for
|
||||
.attrs = cstate_format_attrs,
|
||||
};
|
||||
|
||||
-static cpumask_t cstate_core_cpu_mask;
|
||||
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
|
||||
-
|
||||
-static struct attribute *cstate_cpumask_attrs[] = {
|
||||
- &dev_attr_cpumask.attr,
|
||||
- NULL,
|
||||
-};
|
||||
-
|
||||
-static struct attribute_group cpumask_attr_group = {
|
||||
- .attrs = cstate_cpumask_attrs,
|
||||
-};
|
||||
-
|
||||
static const struct attribute_group *cstate_attr_groups[] = {
|
||||
&cstate_events_attr_group,
|
||||
&cstate_format_attr_group,
|
||||
- &cpumask_attr_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
|
||||
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
|
||||
};
|
||||
|
||||
-static cpumask_t cstate_pkg_cpu_mask;
|
||||
-
|
||||
/* cstate_module PMU */
|
||||
static struct pmu cstate_module_pmu;
|
||||
static bool has_cstate_module;
|
||||
@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = {
|
||||
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
|
||||
};
|
||||
|
||||
-static cpumask_t cstate_module_cpu_mask;
|
||||
-
|
||||
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
|
||||
- struct device_attribute *attr,
|
||||
- char *buf)
|
||||
-{
|
||||
- struct pmu *pmu = dev_get_drvdata(dev);
|
||||
-
|
||||
- if (pmu == &cstate_core_pmu)
|
||||
- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
|
||||
- else if (pmu == &cstate_pkg_pmu)
|
||||
- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
|
||||
- else if (pmu == &cstate_module_pmu)
|
||||
- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
|
||||
- else
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int cstate_pmu_event_init(struct perf_event *event)
|
||||
{
|
||||
u64 cfg = event->attr.config;
|
||||
- int cpu;
|
||||
|
||||
if (event->attr.type != event->pmu->type)
|
||||
return -ENOENT;
|
||||
@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct
|
||||
if (!(core_msr_mask & (1 << cfg)))
|
||||
return -EINVAL;
|
||||
event->hw.event_base = core_msr[cfg].msr;
|
||||
- cpu = cpumask_any_and(&cstate_core_cpu_mask,
|
||||
- topology_sibling_cpumask(event->cpu));
|
||||
} else if (event->pmu == &cstate_pkg_pmu) {
|
||||
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
|
||||
return -EINVAL;
|
||||
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
|
||||
if (!(pkg_msr_mask & (1 << cfg)))
|
||||
return -EINVAL;
|
||||
-
|
||||
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
|
||||
-
|
||||
event->hw.event_base = pkg_msr[cfg].msr;
|
||||
- cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
|
||||
- topology_die_cpumask(event->cpu));
|
||||
} else if (event->pmu == &cstate_module_pmu) {
|
||||
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
|
||||
return -EINVAL;
|
||||
@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct
|
||||
if (!(module_msr_mask & (1 << cfg)))
|
||||
return -EINVAL;
|
||||
event->hw.event_base = module_msr[cfg].msr;
|
||||
- cpu = cpumask_any_and(&cstate_module_cpu_mask,
|
||||
- topology_cluster_cpumask(event->cpu));
|
||||
} else {
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
- if (cpu >= nr_cpu_ids)
|
||||
- return -ENODEV;
|
||||
-
|
||||
- event->cpu = cpu;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = -1;
|
||||
return 0;
|
||||
@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct p
|
||||
return 0;
|
||||
}
|
||||
|
||||
-/*
|
||||
- * Check if exiting cpu is the designated reader. If so migrate the
|
||||
- * events when there is a valid target available
|
||||
- */
|
||||
-static int cstate_cpu_exit(unsigned int cpu)
|
||||
-{
|
||||
- unsigned int target;
|
||||
-
|
||||
- if (has_cstate_core &&
|
||||
- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
|
||||
-
|
||||
- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
|
||||
- /* Migrate events if there is a valid target */
|
||||
- if (target < nr_cpu_ids) {
|
||||
- cpumask_set_cpu(target, &cstate_core_cpu_mask);
|
||||
- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- if (has_cstate_pkg &&
|
||||
- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
|
||||
-
|
||||
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
|
||||
- /* Migrate events if there is a valid target */
|
||||
- if (target < nr_cpu_ids) {
|
||||
- cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
|
||||
- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- if (has_cstate_module &&
|
||||
- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
|
||||
-
|
||||
- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
|
||||
- /* Migrate events if there is a valid target */
|
||||
- if (target < nr_cpu_ids) {
|
||||
- cpumask_set_cpu(target, &cstate_module_cpu_mask);
|
||||
- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
|
||||
- }
|
||||
- }
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int cstate_cpu_init(unsigned int cpu)
|
||||
-{
|
||||
- unsigned int target;
|
||||
-
|
||||
- /*
|
||||
- * If this is the first online thread of that core, set it in
|
||||
- * the core cpu mask as the designated reader.
|
||||
- */
|
||||
- target = cpumask_any_and(&cstate_core_cpu_mask,
|
||||
- topology_sibling_cpumask(cpu));
|
||||
-
|
||||
- if (has_cstate_core && target >= nr_cpu_ids)
|
||||
- cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
|
||||
-
|
||||
- /*
|
||||
- * If this is the first online thread of that package, set it
|
||||
- * in the package cpu mask as the designated reader.
|
||||
- */
|
||||
- target = cpumask_any_and(&cstate_pkg_cpu_mask,
|
||||
- topology_die_cpumask(cpu));
|
||||
- if (has_cstate_pkg && target >= nr_cpu_ids)
|
||||
- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
|
||||
-
|
||||
- /*
|
||||
- * If this is the first online thread of that cluster, set it
|
||||
- * in the cluster cpu mask as the designated reader.
|
||||
- */
|
||||
- target = cpumask_any_and(&cstate_module_cpu_mask,
|
||||
- topology_cluster_cpumask(cpu));
|
||||
- if (has_cstate_module && target >= nr_cpu_ids)
|
||||
- cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static const struct attribute_group *core_attr_update[] = {
|
||||
&group_cstate_core_c1,
|
||||
&group_cstate_core_c3,
|
||||
@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = {
|
||||
.stop = cstate_pmu_event_stop,
|
||||
.read = cstate_pmu_event_update,
|
||||
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
|
||||
+ .scope = PERF_PMU_SCOPE_CORE,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
|
||||
.stop = cstate_pmu_event_stop,
|
||||
.read = cstate_pmu_event_update,
|
||||
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
|
||||
+ .scope = PERF_PMU_SCOPE_PKG,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = {
|
||||
.stop = cstate_pmu_event_stop,
|
||||
.read = cstate_pmu_event_update,
|
||||
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
|
||||
+ .scope = PERF_PMU_SCOPE_CLUSTER,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
@@ -810,9 +684,6 @@ static int __init cstate_probe(const str
|
||||
|
||||
static inline void cstate_cleanup(void)
|
||||
{
|
||||
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
|
||||
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
|
||||
-
|
||||
if (has_cstate_core)
|
||||
perf_pmu_unregister(&cstate_core_pmu);
|
||||
|
||||
@@ -827,11 +698,6 @@ static int __init cstate_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
|
||||
- "perf/x86/cstate:starting", cstate_cpu_init, NULL);
|
||||
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
|
||||
- "perf/x86/cstate:online", NULL, cstate_cpu_exit);
|
||||
-
|
||||
if (has_cstate_core) {
|
||||
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
|
||||
if (err) {
|
||||
@@ -844,6 +710,8 @@ static int __init cstate_init(void)
|
||||
|
||||
if (has_cstate_pkg) {
|
||||
if (topology_max_dies_per_package() > 1) {
|
||||
+ /* CLX-AP is multi-die and the cstate is die-scope */
|
||||
+ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
|
||||
err = perf_pmu_register(&cstate_pkg_pmu,
|
||||
"cstate_die", -1);
|
||||
} else {
|
||||
--- a/include/linux/cpuhotplug.h
|
||||
+++ b/include/linux/cpuhotplug.h
|
||||
@@ -152,7 +152,6 @@ enum cpuhp_state {
|
||||
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
|
||||
CPUHP_AP_PERF_X86_STARTING,
|
||||
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
|
||||
- CPUHP_AP_PERF_X86_CSTATE_STARTING,
|
||||
CPUHP_AP_PERF_XTENSA_STARTING,
|
||||
CPUHP_AP_ARM_VFP_STARTING,
|
||||
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
|
||||
@@ -209,7 +208,6 @@ enum cpuhp_state {
|
||||
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
|
||||
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
|
||||
CPUHP_AP_PERF_X86_RAPL_ONLINE,
|
||||
- CPUHP_AP_PERF_X86_CSTATE_ONLINE,
|
||||
CPUHP_AP_PERF_S390_CF_ONLINE,
|
||||
CPUHP_AP_PERF_S390_SF_ONLINE,
|
||||
CPUHP_AP_PERF_ARM_CCI_ONLINE,
|
debian/patches/patchset-pf/amd-rapl/0004-iommu-vt-d-Clean-up-cpumask-and-hotplug-for-perfmon.patch (new file, vendored, 188 lines)
@@ -0,0 +1,188 @@
From f91da33af8295b4b3d73a2083225f69e1d5ff301 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:40 -0700
Subject: iommu/vt-d: Clean up cpumask and hotplug for perfmon

The iommu PMU is system-wide scope, which is supported by the generic
perf_event subsystem now.

Set the scope for the iommu PMU and remove all the cpumask and hotplug
codes.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Cc: iommu@lists.linux.dev
---
 drivers/iommu/intel/iommu.h   |   2 -
 drivers/iommu/intel/perfmon.c | 111 +---------------------------------
 2 files changed, 2 insertions(+), 111 deletions(-)

--- a/drivers/iommu/intel/iommu.h
|
||||
+++ b/drivers/iommu/intel/iommu.h
|
||||
@@ -687,8 +687,6 @@ struct iommu_pmu {
|
||||
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
|
||||
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
|
||||
unsigned char irq_name[16];
|
||||
- struct hlist_node cpuhp_node;
|
||||
- int cpu;
|
||||
};
|
||||
|
||||
#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
|
||||
--- a/drivers/iommu/intel/perfmon.c
|
||||
+++ b/drivers/iommu/intel/perfmon.c
|
||||
@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_
|
||||
.attrs = attrs_empty,
|
||||
};
|
||||
|
||||
-static cpumask_t iommu_pmu_cpu_mask;
|
||||
-
|
||||
-static ssize_t
|
||||
-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
|
||||
-{
|
||||
- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
|
||||
-}
|
||||
-static DEVICE_ATTR_RO(cpumask);
|
||||
-
|
||||
-static struct attribute *iommu_pmu_cpumask_attrs[] = {
|
||||
- &dev_attr_cpumask.attr,
|
||||
- NULL
|
||||
-};
|
||||
-
|
||||
-static struct attribute_group iommu_pmu_cpumask_attr_group = {
|
||||
- .attrs = iommu_pmu_cpumask_attrs,
|
||||
-};
|
||||
-
|
||||
static const struct attribute_group *iommu_pmu_attr_groups[] = {
|
||||
&iommu_pmu_format_attr_group,
|
||||
&iommu_pmu_events_attr_group,
|
||||
- &iommu_pmu_cpumask_attr_group,
|
||||
NULL
|
||||
};
|
||||
|
||||
@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct i
|
||||
iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
|
||||
iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
|
||||
iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
+ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
|
||||
iommu_pmu->pmu.module = THIS_MODULE;
|
||||
|
||||
return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
|
||||
@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(st
|
||||
iommu->perf_irq = 0;
|
||||
}
|
||||
|
||||
-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
|
||||
-{
|
||||
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
|
||||
-
|
||||
- if (cpumask_empty(&iommu_pmu_cpu_mask))
|
||||
- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
|
||||
-
|
||||
- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
|
||||
- iommu_pmu->cpu = cpu;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
|
||||
-{
|
||||
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
|
||||
- int target = cpumask_first(&iommu_pmu_cpu_mask);
|
||||
-
|
||||
- /*
|
||||
- * The iommu_pmu_cpu_mask has been updated when offline the CPU
|
||||
- * for the first iommu_pmu. Migrate the other iommu_pmu to the
|
||||
- * new target.
|
||||
- */
|
||||
- if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
|
||||
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
|
||||
- iommu_pmu->cpu = target;
|
||||
- return 0;
|
||||
- }
|
||||
-
|
||||
- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
|
||||
- return 0;
|
||||
-
|
||||
- target = cpumask_any_but(cpu_online_mask, cpu);
|
||||
-
|
||||
- if (target < nr_cpu_ids)
|
||||
- cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
|
||||
- else
|
||||
- return 0;
|
||||
-
|
||||
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
|
||||
- iommu_pmu->cpu = target;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int nr_iommu_pmu;
|
||||
-static enum cpuhp_state iommu_cpuhp_slot;
|
||||
-
|
||||
-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
|
||||
-{
|
||||
- int ret;
|
||||
-
|
||||
- if (!nr_iommu_pmu) {
|
||||
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
|
||||
- "driver/iommu/intel/perfmon:online",
|
||||
- iommu_pmu_cpu_online,
|
||||
- iommu_pmu_cpu_offline);
|
||||
- if (ret < 0)
|
||||
- return ret;
|
||||
- iommu_cpuhp_slot = ret;
|
||||
- }
|
||||
-
|
||||
- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
|
||||
- if (ret) {
|
||||
- if (!nr_iommu_pmu)
|
||||
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
|
||||
- return ret;
|
||||
- }
|
||||
- nr_iommu_pmu++;
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
|
||||
-{
|
||||
- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
|
||||
-
|
||||
- if (--nr_iommu_pmu)
|
||||
- return;
|
||||
-
|
||||
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
|
||||
-}
|
||||
-
|
||||
void iommu_pmu_register(struct intel_iommu *iommu)
|
||||
{
|
||||
struct iommu_pmu *iommu_pmu = iommu->pmu;
|
||||
@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iom
|
||||
if (__iommu_pmu_register(iommu))
|
||||
goto err;
|
||||
|
||||
- if (iommu_pmu_cpuhp_setup(iommu_pmu))
|
||||
- goto unregister;
|
||||
-
|
||||
/* Set interrupt for overflow */
|
||||
if (iommu_pmu_set_interrupt(iommu))
|
||||
- goto cpuhp_free;
|
||||
+ goto unregister;
|
||||
|
||||
return;
|
||||
|
||||
-cpuhp_free:
|
||||
- iommu_pmu_cpuhp_free(iommu_pmu);
|
||||
unregister:
|
||||
perf_pmu_unregister(&iommu_pmu->pmu);
|
||||
err:
|
||||
@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_i
|
||||
return;
|
||||
|
||||
iommu_pmu_unset_interrupt(iommu);
|
||||
- iommu_pmu_cpuhp_free(iommu_pmu);
|
||||
perf_pmu_unregister(&iommu_pmu->pmu);
|
||||
}
|
debian/patches/patchset-pf/amd-rapl/0005-dmaengine-idxd-Clean-up-cpumask-and-hotplug-for-perf.patch (new file, vendored, 238 lines)
@@ -0,0 +1,238 @@
From 76278bd3946d618ead2d9cc22612a75a4ab99ace Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:41 -0700
Subject: dmaengine: idxd: Clean up cpumask and hotplug for perfmon

The idxd PMU is system-wide scope, which is supported by the generic
perf_event subsystem now.

Set the scope for the idxd PMU and remove all the cpumask and hotplug
codes.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vinod Koul <vkoul@kernel.org>
Cc: dmaengine@vger.kernel.org
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
---
 drivers/dma/idxd/idxd.h    |  7 ---
 drivers/dma/idxd/init.c    |  3 --
 drivers/dma/idxd/perfmon.c | 98 +-------------------------------------
 3 files changed, 1 insertion(+), 107 deletions(-)

--- a/drivers/dma/idxd/idxd.h
|
||||
+++ b/drivers/dma/idxd/idxd.h
|
||||
@@ -124,7 +124,6 @@ struct idxd_pmu {
|
||||
|
||||
struct pmu pmu;
|
||||
char name[IDXD_NAME_SIZE];
|
||||
- int cpu;
|
||||
|
||||
int n_counters;
|
||||
int counter_width;
|
||||
@@ -135,8 +134,6 @@ struct idxd_pmu {
|
||||
|
||||
unsigned long supported_filters;
|
||||
int n_filters;
|
||||
-
|
||||
- struct hlist_node cpuhp_node;
|
||||
};
|
||||
|
||||
#define IDXD_MAX_PRIORITY 0xf
|
||||
@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct
|
||||
int perfmon_pmu_init(struct idxd_device *idxd);
|
||||
void perfmon_pmu_remove(struct idxd_device *idxd);
|
||||
void perfmon_counter_overflow(struct idxd_device *idxd);
|
||||
-void perfmon_init(void);
|
||||
-void perfmon_exit(void);
|
||||
#else
|
||||
static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; }
|
||||
static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
|
||||
static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
|
||||
-static inline void perfmon_init(void) {}
|
||||
-static inline void perfmon_exit(void) {}
|
||||
#endif
|
||||
|
||||
/* debugfs */
|
||||
--- a/drivers/dma/idxd/init.c
|
||||
+++ b/drivers/dma/idxd/init.c
|
||||
@@ -878,8 +878,6 @@ static int __init idxd_init_module(void)
|
||||
else
|
||||
support_enqcmd = true;
|
||||
|
||||
- perfmon_init();
|
||||
-
|
||||
err = idxd_driver_register(&idxd_drv);
|
||||
if (err < 0)
|
||||
goto err_idxd_driver_register;
|
||||
@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void
|
||||
idxd_driver_unregister(&idxd_drv);
|
||||
pci_unregister_driver(&idxd_pci_driver);
|
||||
idxd_cdev_remove();
|
||||
- perfmon_exit();
|
||||
idxd_remove_debugfs();
|
||||
}
|
||||
module_exit(idxd_exit_module);
|
||||
--- a/drivers/dma/idxd/perfmon.c
|
||||
+++ b/drivers/dma/idxd/perfmon.c
|
||||
@@ -6,29 +6,6 @@
|
||||
#include "idxd.h"
|
||||
#include "perfmon.h"
|
||||
|
||||
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
|
||||
- char *buf);
|
||||
-
|
||||
-static cpumask_t perfmon_dsa_cpu_mask;
|
||||
-static bool cpuhp_set_up;
|
||||
-static enum cpuhp_state cpuhp_slot;
|
||||
-
|
||||
-/*
|
||||
- * perf userspace reads this attribute to determine which cpus to open
|
||||
- * counters on. It's connected to perfmon_dsa_cpu_mask, which is
|
||||
- * maintained by the cpu hotplug handlers.
|
||||
- */
|
||||
-static DEVICE_ATTR_RO(cpumask);
|
||||
-
|
||||
-static struct attribute *perfmon_cpumask_attrs[] = {
|
||||
- &dev_attr_cpumask.attr,
|
||||
- NULL,
|
||||
-};
|
||||
-
|
||||
-static struct attribute_group cpumask_attr_group = {
|
||||
- .attrs = perfmon_cpumask_attrs,
|
||||
-};
|
||||
-
|
||||
/*
|
||||
* These attributes specify the bits in the config word that the perf
|
||||
* syscall uses to pass the event ids and categories to perfmon.
|
||||
@@ -67,16 +44,9 @@ static struct attribute_group perfmon_fo
|
||||
|
||||
static const struct attribute_group *perfmon_attr_groups[] = {
|
||||
&perfmon_format_attr_group,
|
||||
- &cpumask_attr_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
|
||||
- char *buf)
|
||||
-{
|
||||
- return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
|
||||
-}
|
||||
-
|
||||
static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
|
||||
{
|
||||
return &idxd_pmu->pmu == event->pmu;
|
||||
@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct
|
||||
return -EINVAL;
|
||||
|
||||
event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
|
||||
- event->cpu = idxd->idxd_pmu->cpu;
|
||||
event->hw.config = event->attr.config;
|
||||
|
||||
if (event->group_leader != event)
|
||||
@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pm
|
||||
idxd_pmu->pmu.stop = perfmon_pmu_event_stop;
|
||||
idxd_pmu->pmu.read = perfmon_pmu_event_update;
|
||||
idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
+ idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
|
||||
idxd_pmu->pmu.module = THIS_MODULE;
|
||||
}
|
||||
|
||||
@@ -496,47 +466,11 @@ void perfmon_pmu_remove(struct idxd_devi
|
||||
if (!idxd->idxd_pmu)
|
||||
return;
|
||||
|
||||
- cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
|
||||
perf_pmu_unregister(&idxd->idxd_pmu->pmu);
|
||||
kfree(idxd->idxd_pmu);
|
||||
idxd->idxd_pmu = NULL;
|
||||
}
|
||||
|
||||
-static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
|
||||
-{
|
||||
- struct idxd_pmu *idxd_pmu;
|
||||
-
|
||||
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
|
||||
-
|
||||
- /* select the first online CPU as the designated reader */
|
||||
- if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
|
||||
- cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
|
||||
- idxd_pmu->cpu = cpu;
|
||||
- }
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
|
||||
-{
|
||||
- struct idxd_pmu *idxd_pmu;
|
||||
- unsigned int target;
|
||||
-
|
||||
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
|
||||
-
|
||||
- if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
|
||||
- return 0;
|
||||
-
|
||||
- target = cpumask_any_but(cpu_online_mask, cpu);
|
||||
- /* migrate events if there is a valid target */
|
||||
- if (target < nr_cpu_ids) {
|
||||
- cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
|
||||
- perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
|
||||
- }
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
int perfmon_pmu_init(struct idxd_device *idxd)
|
||||
{
|
||||
union idxd_perfcap perfcap;
|
||||
@@ -544,12 +478,6 @@ int perfmon_pmu_init(struct idxd_device
|
||||
int rc = -ENODEV;
|
||||
|
||||
/*
|
||||
- * perfmon module initialization failed, nothing to do
|
||||
- */
|
||||
- if (!cpuhp_set_up)
|
||||
- return -ENODEV;
|
||||
-
|
||||
- /*
|
||||
* If perfmon_offset or num_counters is 0, it means perfmon is
|
||||
* not supported on this hardware.
|
||||
*/
|
||||
@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device
|
||||
if (rc)
|
||||
goto free;
|
||||
|
||||
- rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
|
||||
- if (rc) {
|
||||
- perf_pmu_unregister(&idxd->idxd_pmu->pmu);
|
||||
- goto free;
|
||||
- }
|
||||
out:
|
||||
return rc;
|
||||
free:
|
||||
@@ -637,22 +560,3 @@ free:
|
||||
|
||||
goto out;
|
||||
}
|
||||
-
|
||||
-void __init perfmon_init(void)
|
||||
-{
|
||||
- int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
|
||||
- "driver/dma/idxd/perf:online",
|
||||
- perf_event_cpu_online,
|
||||
- perf_event_cpu_offline);
|
||||
- if (WARN_ON(rc < 0))
|
||||
- return;
|
||||
-
|
||||
- cpuhp_slot = rc;
|
||||
- cpuhp_set_up = true;
|
||||
-}
|
||||
-
|
||||
-void __exit perfmon_exit(void)
|
||||
-{
|
||||
- if (cpuhp_set_up)
|
||||
- cpuhp_remove_multi_state(cpuhp_slot);
|
||||
-}
|
@@ -0,0 +1,84 @@
From fb0a3b5932882f02ed42fcaa6db73aba3eafd6d7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:42 -0700
Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug

The rapl pmu just needs to be allocated once. It doesn't matter to be
allocated at each CPU hotplug, or the global init_rapl_pmus().

Move the pmu allocation to the init_rapl_pmus(). So the generic hotplug
supports can be applied.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 44 +++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

--- a/arch/x86/events/rapl.c
|
||||
+++ b/arch/x86/events/rapl.c
|
||||
@@ -568,19 +568,8 @@ static int rapl_cpu_online(unsigned int
|
||||
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
|
||||
int target;
|
||||
|
||||
- if (!pmu) {
|
||||
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
|
||||
- if (!pmu)
|
||||
- return -ENOMEM;
|
||||
-
|
||||
- raw_spin_lock_init(&pmu->lock);
|
||||
- INIT_LIST_HEAD(&pmu->active_list);
|
||||
- pmu->pmu = &rapl_pmus->pmu;
|
||||
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
- rapl_hrtimer_init(pmu);
|
||||
-
|
||||
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
|
||||
- }
|
||||
+ if (!pmu)
|
||||
+ return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Check if there is an online cpu in the package which collects rapl
|
||||
@@ -673,6 +662,32 @@ static const struct attribute_group *rap
|
||||
NULL,
|
||||
};
|
||||
|
||||
+static void __init init_rapl_pmu(void)
|
||||
+{
|
||||
+ struct rapl_pmu *pmu;
|
||||
+ int cpu;
|
||||
+
|
||||
+ cpus_read_lock();
|
||||
+
|
||||
+ for_each_cpu(cpu, cpu_online_mask) {
|
||||
+ pmu = cpu_to_rapl_pmu(cpu);
|
||||
+ if (pmu)
|
||||
+ continue;
|
||||
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
|
||||
+ if (!pmu)
|
||||
+ continue;
|
||||
+ raw_spin_lock_init(&pmu->lock);
|
||||
+ INIT_LIST_HEAD(&pmu->active_list);
|
||||
+ pmu->pmu = &rapl_pmus->pmu;
|
||||
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
+ rapl_hrtimer_init(pmu);
|
||||
+
|
||||
+ rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
|
||||
+ }
|
||||
+
|
||||
+ cpus_read_unlock();
|
||||
+}
|
||||
+
|
||||
static int __init init_rapl_pmus(void)
|
||||
{
|
||||
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
|
||||
@@ -693,6 +708,9 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.read = rapl_pmu_event_read;
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
+
|
||||
+ init_rapl_pmu();
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
debian/patches/patchset-pf/amd-rapl/0007-perf-x86-rapl-Clean-up-cpumask-and-hotplug.patch (new file, vendored, 179 lines)
@@ -0,0 +1,179 @@
From 7b4f6ba1b1dc5f3120652bcb5921a697d5167bff Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:43 -0700
Subject: perf/x86/rapl: Clean up cpumask and hotplug

The rapl pmu is die scope, which is supported by the generic perf_event
subsystem now.

Set the scope for the rapl PMU and remove all the cpumask and hotplug
codes.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c     | 80 +-------------------------------------
 include/linux/cpuhotplug.h |  1 -
 2 files changed, 2 insertions(+), 79 deletions(-)

--- a/arch/x86/events/rapl.c
|
||||
+++ b/arch/x86/events/rapl.c
|
||||
@@ -135,7 +135,6 @@ struct rapl_model {
|
||||
/* 1/2^hw_unit Joule */
|
||||
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus;
|
||||
-static cpumask_t rapl_cpu_mask;
|
||||
static unsigned int rapl_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
@@ -340,8 +339,6 @@ static int rapl_pmu_event_init(struct pe
|
||||
if (event->cpu < 0)
|
||||
return -EINVAL;
|
||||
|
||||
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
|
||||
-
|
||||
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
@@ -360,7 +357,6 @@ static int rapl_pmu_event_init(struct pe
|
||||
pmu = cpu_to_rapl_pmu(event->cpu);
|
||||
if (!pmu)
|
||||
return -EINVAL;
|
||||
- event->cpu = pmu->cpu;
|
||||
event->pmu_private = pmu;
|
||||
event->hw.event_base = rapl_msrs[bit].msr;
|
||||
event->hw.config = cfg;
|
||||
@@ -374,23 +370,6 @@ static void rapl_pmu_event_read(struct p
|
||||
rapl_event_update(event);
|
||||
}
|
||||
|
||||
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
|
||||
- struct device_attribute *attr, char *buf)
|
||||
-{
|
||||
- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
|
||||
-}
|
||||
-
|
||||
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
|
||||
-
|
||||
-static struct attribute *rapl_pmu_attrs[] = {
|
||||
- &dev_attr_cpumask.attr,
|
||||
- NULL,
|
||||
-};
|
||||
-
|
||||
-static struct attribute_group rapl_pmu_attr_group = {
|
||||
- .attrs = rapl_pmu_attrs,
|
||||
-};
|
||||
-
|
||||
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
|
||||
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
|
||||
@@ -438,7 +417,6 @@ static struct attribute_group rapl_pmu_f
|
||||
};
|
||||
|
||||
static const struct attribute_group *rapl_attr_groups[] = {
|
||||
- &rapl_pmu_attr_group,
|
||||
&rapl_pmu_format_group,
|
||||
&rapl_pmu_events_group,
|
||||
NULL,
|
||||
@@ -541,49 +519,6 @@ static struct perf_msr amd_rapl_msrs[] =
|
||||
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
|
||||
};
|
||||
|
||||
-static int rapl_cpu_offline(unsigned int cpu)
|
||||
-{
|
||||
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
|
||||
- int target;
|
||||
-
|
||||
- /* Check if exiting cpu is used for collecting rapl events */
|
||||
- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
|
||||
- return 0;
|
||||
-
|
||||
- pmu->cpu = -1;
|
||||
- /* Find a new cpu to collect rapl events */
|
||||
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
|
||||
-
|
||||
- /* Migrate rapl events to the new target */
|
||||
- if (target < nr_cpu_ids) {
|
||||
- cpumask_set_cpu(target, &rapl_cpu_mask);
|
||||
- pmu->cpu = target;
|
||||
- perf_pmu_migrate_context(pmu->pmu, cpu, target);
|
||||
- }
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
-static int rapl_cpu_online(unsigned int cpu)
|
||||
-{
|
||||
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
|
||||
- int target;
|
||||
-
|
||||
- if (!pmu)
|
||||
- return -ENOMEM;
|
||||
-
|
||||
- /*
|
||||
- * Check if there is an online cpu in the package which collects rapl
|
||||
- * events already.
|
||||
- */
|
||||
- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
|
||||
- if (target < nr_cpu_ids)
|
||||
- return 0;
|
||||
-
|
||||
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
|
||||
- pmu->cpu = cpu;
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
{
|
||||
u64 msr_rapl_power_unit_bits;
|
||||
@@ -707,6 +642,7 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
|
||||
rapl_pmus->pmu.read = rapl_pmu_event_read;
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
+ rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
|
||||
init_rapl_pmu();
|
||||
@@ -857,24 +793,13 @@ static int __init rapl_pmu_init(void)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- /*
|
||||
- * Install callbacks. Core will call them for each online cpu.
|
||||
- */
|
||||
- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
|
||||
- "perf/x86/rapl:online",
|
||||
- rapl_cpu_online, rapl_cpu_offline);
|
||||
- if (ret)
|
||||
- goto out;
|
||||
-
|
||||
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
|
||||
if (ret)
|
||||
- goto out1;
|
||||
+ goto out;
|
||||
|
||||
rapl_advertise();
|
||||
return 0;
|
||||
|
||||
-out1:
|
||||
- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
|
||||
out:
|
||||
pr_warn("Initialization failed (%d), disabled\n", ret);
|
||||
cleanup_rapl_pmus();
|
||||
@@ -884,7 +809,6 @@ module_init(rapl_pmu_init);
|
||||
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
|
||||
perf_pmu_unregister(&rapl_pmus->pmu);
|
||||
cleanup_rapl_pmus();
|
||||
}
|
||||
--- a/include/linux/cpuhotplug.h
|
||||
+++ b/include/linux/cpuhotplug.h
|
||||
@@ -207,7 +207,6 @@ enum cpuhp_state {
|
||||
CPUHP_AP_PERF_X86_UNCORE_ONLINE,
|
||||
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
|
||||
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
|
||||
- CPUHP_AP_PERF_X86_RAPL_ONLINE,
|
||||
CPUHP_AP_PERF_S390_CF_ONLINE,
|
||||
CPUHP_AP_PERF_S390_SF_ONLINE,
|
||||
CPUHP_AP_PERF_ARM_CCI_ONLINE,
|
debian/patches/patchset-pf/amd-rapl/0008-perf-x86-rapl-Fix-the-energy-pkg-event-for-AMD-CPUs.patch (new file, vendored, 101 lines)
@@ -0,0 +1,101 @@
From f1525664ff9da3241b3556594dc0b67506ae1ddd Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Tue, 10 Sep 2024 14:25:05 +0530
Subject: perf/x86/rapl: Fix the energy-pkg event for AMD CPUs

After commit ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"),
on AMD processors that support extended CPUID leaf 0x80000026, the
topology_die_cpumask() and topology_logical_die_id() macros, no longer
return the package cpumask and package id, instead they return the CCD
(Core Complex Die) mask and id respectively. This leads to the energy-pkg
event scope to be modified to CCD instead of package.

So, change the PMU scope for AMD and Hygon back to package.

On a 12 CCD 1 Package AMD Zen4 Genoa machine:

Before:
$ cat /sys/devices/power/cpumask
0,8,16,24,32,40,48,56,64,72,80,88.

The expected cpumask here is supposed to be just "0", as it is a package
scope event, only one CPU will be collecting the event for all the CPUs in
the package.

After:
$ cat /sys/devices/power/cpumask
0

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 35 ++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

--- a/arch/x86/events/rapl.c
|
||||
+++ b/arch/x86/events/rapl.c
|
||||
@@ -139,9 +139,32 @@ static unsigned int rapl_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
|
||||
+/*
|
||||
+ * RAPL Package energy counter scope:
|
||||
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
|
||||
+ * 2. For Intel platforms
|
||||
+ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
|
||||
+ * 2.2. Other Intel platforms are single die systems so the scope can be
|
||||
+ * considered as either pkg-scope or die-scope, and we are considering
|
||||
+ * them as die-scope.
|
||||
+ */
|
||||
+#define rapl_pmu_is_pkg_scope() \
|
||||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
|
||||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
+
|
||||
+/*
|
||||
+ * Helper function to get the correct topology id according to the
|
||||
+ * RAPL PMU scope.
|
||||
+ */
|
||||
+static inline unsigned int get_rapl_pmu_idx(int cpu)
|
||||
+{
|
||||
+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
+ topology_logical_die_id(cpu);
|
||||
+}
|
||||
+
|
||||
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
|
||||
{
|
||||
- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
|
||||
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
|
||||
|
||||
/*
|
||||
* The unsigned check also catches the '-1' return value for non
|
||||
@@ -617,7 +640,7 @@ static void __init init_rapl_pmu(void)
|
||||
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(pmu);
|
||||
|
||||
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
|
||||
+ rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
|
||||
}
|
||||
|
||||
cpus_read_unlock();
|
||||
@@ -626,6 +649,12 @@ static void __init init_rapl_pmu(void)
|
||||
static int __init init_rapl_pmus(void)
|
||||
{
|
||||
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
|
||||
+ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
+
|
||||
+ if (rapl_pmu_is_pkg_scope()) {
|
||||
+ nr_rapl_pmu = topology_max_packages();
|
||||
+ rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
+ }
|
||||
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
@@ -641,8 +670,8 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.start = rapl_pmu_event_start;
|
||||
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
|
||||
rapl_pmus->pmu.read = rapl_pmu_event_read;
|
||||
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
- rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
|
||||
init_rapl_pmu();
|
debian/patches/patchset-pf/amd-rapl/0009-x86-topology-Introduce-topology_logical_core_id.patch (new file, vendored, 77 lines)
@@ -0,0 +1,77 @@
From 9439067951f4d857272836b35812af26650d9c16 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Fri, 13 Sep 2024 15:21:41 +0000
Subject: x86/topology: Introduce topology_logical_core_id()

On x86, topology_core_id() returns a unique core ID within the PKG
domain. Looking at match_smt() suggests that a core ID just needs to be
unique within a LLC domain. For use cases such as the per-core RAPL PMU,
there exists a need for a unique core ID across the entire system with
multiple PKG domains. Introduce topology_logical_core_id() to derive a
unique core ID across the system.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 Documentation/arch/x86/topology.rst   | 4 ++++
 arch/x86/include/asm/processor.h      | 1 +
 arch/x86/include/asm/topology.h       | 1 +
 arch/x86/kernel/cpu/debugfs.c         | 1 +
 arch/x86/kernel/cpu/topology_common.c | 1 +
 5 files changed, 8 insertions(+)

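To make the motivation concrete, an illustrative sketch (not part of the patch): topology_core_id() repeats across packages, while the new topology_logical_core_id() is unique system-wide, so a per-core PMU can use it to index a flat per-core array. The array and function names below are hypothetical.

/* Hypothetical per-core lookup keyed by the new system-wide core ID. */
static struct rapl_pmu *per_core_pmus[NR_CPUS];

static struct rapl_pmu *cpu_to_core_pmu(unsigned int cpu)
{
	return per_core_pmus[topology_logical_core_id(cpu)];
}
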
--- a/Documentation/arch/x86/topology.rst
|
||||
+++ b/Documentation/arch/x86/topology.rst
|
||||
@@ -135,6 +135,10 @@ Thread-related topology information in t
|
||||
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
|
||||
"core_id."
|
||||
|
||||
+ - topology_logical_core_id();
|
||||
+
|
||||
+ The logical core ID to which a thread belongs.
|
||||
+
|
||||
|
||||
|
||||
System topology examples
|
||||
--- a/arch/x86/include/asm/processor.h
|
||||
+++ b/arch/x86/include/asm/processor.h
|
||||
@@ -98,6 +98,7 @@ struct cpuinfo_topology {
|
||||
// Logical ID mappings
|
||||
u32 logical_pkg_id;
|
||||
u32 logical_die_id;
|
||||
+ u32 logical_core_id;
|
||||
|
||||
// AMD Node ID and Nodes per Package info
|
||||
u32 amd_node_id;
|
||||
--- a/arch/x86/include/asm/topology.h
|
||||
+++ b/arch/x86/include/asm/topology.h
|
||||
@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_cluster
|
||||
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
|
||||
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
|
||||
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
|
||||
+#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
|
||||
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
|
||||
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
|
||||
#define topology_ppin(cpu) (cpu_data(cpu).ppin)
|
||||
--- a/arch/x86/kernel/cpu/debugfs.c
|
||||
+++ b/arch/x86/kernel/cpu/debugfs.c
|
||||
@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_fil
|
||||
seq_printf(m, "core_id: %u\n", c->topo.core_id);
|
||||
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
|
||||
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
|
||||
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
|
||||
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
|
||||
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
|
||||
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
|
||||
--- a/arch/x86/kernel/cpu/topology_common.c
|
||||
+++ b/arch/x86/kernel/cpu/topology_common.c
|
||||
@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_sca
|
||||
if (!early) {
|
||||
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
|
||||
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
|
||||
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
|
||||
}
|
||||
|
||||
/* Package relative core ID */
|
debian/patches/patchset-pf/amd-rapl/0010-perf-x86-rapl-Remove-the-cpu_to_rapl_pmu-function.patch (new file, vendored, 87 lines)
@@ -0,0 +1,87 @@
From b8e1231d5f78314de8f9066baba7b1fdd5e59218 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:42 +0000
Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function

Preparation for the addition of per-core RAPL energy counter support for
AMD CPUs. Post which, one cpu might be mapped to more than one rapl_pmu
(package/die one or per-core one), also makes sense to use the
get_rapl_pmu_idx macro which is anyway used to index into the
rapl_pmus->pmus[] array.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

--- a/arch/x86/events/rapl.c
|
||||
+++ b/arch/x86/events/rapl.c
|
||||
@@ -162,17 +162,6 @@ static inline unsigned int get_rapl_pmu_
|
||||
topology_logical_die_id(cpu);
|
||||
}
|
||||
|
||||
-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
|
||||
-{
|
||||
- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
|
||||
-
|
||||
- /*
|
||||
- * The unsigned check also catches the '-1' return value for non
|
||||
- * existent mappings in the topology map.
|
||||
- */
|
||||
- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
|
||||
-}
|
||||
-
|
||||
static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
{
|
||||
u64 raw;
|
||||
@@ -348,7 +337,7 @@ static void rapl_pmu_event_del(struct pe
|
||||
static int rapl_pmu_event_init(struct perf_event *event)
|
||||
{
|
||||
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
|
||||
- int bit, ret = 0;
|
||||
+ int bit, rapl_pmu_idx, ret = 0;
|
||||
struct rapl_pmu *pmu;
|
||||
|
||||
/* only look at RAPL events */
|
||||
@@ -376,8 +365,12 @@ static int rapl_pmu_event_init(struct pe
|
||||
if (event->attr.sample_period) /* no sampling */
|
||||
return -EINVAL;
|
||||
|
||||
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
|
||||
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
/* must be done before validate_group */
|
||||
- pmu = cpu_to_rapl_pmu(event->cpu);
|
||||
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
|
||||
if (!pmu)
|
||||
return -EINVAL;
|
||||
event->pmu_private = pmu;
|
||||
@@ -623,12 +616,16 @@ static const struct attribute_group *rap
|
||||
static void __init init_rapl_pmu(void)
|
||||
{
|
||||
struct rapl_pmu *pmu;
|
||||
- int cpu;
|
||||
+ int cpu, rapl_pmu_idx;
|
||||
|
||||
cpus_read_lock();
|
||||
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
- pmu = cpu_to_rapl_pmu(cpu);
|
||||
+ rapl_pmu_idx = get_rapl_pmu_idx(cpu);
|
||||
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
+ continue;
|
||||
+
|
||||
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
|
||||
if (pmu)
|
||||
continue;
|
||||
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
|
||||
@@ -640,7 +637,7 @@ static void __init init_rapl_pmu(void)
|
||||
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(pmu);
|
||||
|
||||
- rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
|
||||
+ rapl_pmus->pmus[rapl_pmu_idx] = pmu;
|
||||
}
|
||||
|
||||
cpus_read_unlock();
|
240 debian/patches/patchset-pf/amd-rapl/0011-perf-x86-rapl-Rename-rapl_pmu-variables.patch vendored Normal file
@@ -0,0 +1,240 @@
|
||||
From 07ec9f38cac6eb6e5b0b062ef99e9458ba567de8 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:43 +0000
Subject: perf/x86/rapl: Rename rapl_pmu variables

Rename struct rapl_pmu variables from "pmu" to "rapl_pmu", to
avoid any confusion between the variables of two different
structs pmu and rapl_pmu. As rapl_pmu also contains a pointer to
struct pmu, which leads to situations in code like pmu->pmu,
which is needlessly confusing. Above scenario is replaced with
much more readable rapl_pmu->pmu with this change.

Also rename "pmus" member in rapl_pmus struct, for same reason.

No functional change.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 93 +++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 46 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -116,7 +116,7 @@ struct rapl_pmu {
|
||||
struct rapl_pmus {
|
||||
struct pmu pmu;
|
||||
unsigned int nr_rapl_pmu;
|
||||
- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
|
||||
+ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
|
||||
};
|
||||
|
||||
enum rapl_unit_quirk {
|
||||
@@ -223,34 +223,34 @@ static void rapl_start_hrtimer(struct ra
|
||||
|
||||
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
|
||||
{
|
||||
- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
+ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
struct perf_event *event;
|
||||
unsigned long flags;
|
||||
|
||||
- if (!pmu->n_active)
|
||||
+ if (!rapl_pmu->n_active)
|
||||
return HRTIMER_NORESTART;
|
||||
|
||||
- raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
- list_for_each_entry(event, &pmu->active_list, active_entry)
|
||||
+ list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
|
||||
rapl_event_update(event);
|
||||
|
||||
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
- hrtimer_forward_now(hrtimer, pmu->timer_interval);
|
||||
+ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
|
||||
|
||||
return HRTIMER_RESTART;
|
||||
}
|
||||
|
||||
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
|
||||
+static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
|
||||
{
|
||||
- struct hrtimer *hr = &pmu->hrtimer;
|
||||
+ struct hrtimer *hr = &rapl_pmu->hrtimer;
|
||||
|
||||
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
hr->function = rapl_hrtimer_handle;
|
||||
}
|
||||
|
||||
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
|
||||
+static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
|
||||
struct perf_event *event)
|
||||
{
|
||||
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
|
||||
@@ -258,39 +258,39 @@ static void __rapl_pmu_event_start(struc
|
||||
|
||||
event->hw.state = 0;
|
||||
|
||||
- list_add_tail(&event->active_entry, &pmu->active_list);
|
||||
+ list_add_tail(&event->active_entry, &rapl_pmu->active_list);
|
||||
|
||||
local64_set(&event->hw.prev_count, rapl_read_counter(event));
|
||||
|
||||
- pmu->n_active++;
|
||||
- if (pmu->n_active == 1)
|
||||
- rapl_start_hrtimer(pmu);
|
||||
+ rapl_pmu->n_active++;
|
||||
+ if (rapl_pmu->n_active == 1)
|
||||
+ rapl_start_hrtimer(rapl_pmu);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_start(struct perf_event *event, int mode)
|
||||
{
|
||||
- struct rapl_pmu *pmu = event->pmu_private;
|
||||
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
unsigned long flags;
|
||||
|
||||
- raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
- __rapl_pmu_event_start(pmu, event);
|
||||
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
+ __rapl_pmu_event_start(rapl_pmu, event);
|
||||
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
|
||||
{
|
||||
- struct rapl_pmu *pmu = event->pmu_private;
|
||||
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
- raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
/* mark event as deactivated and stopped */
|
||||
if (!(hwc->state & PERF_HES_STOPPED)) {
|
||||
- WARN_ON_ONCE(pmu->n_active <= 0);
|
||||
- pmu->n_active--;
|
||||
- if (pmu->n_active == 0)
|
||||
- hrtimer_cancel(&pmu->hrtimer);
|
||||
+ WARN_ON_ONCE(rapl_pmu->n_active <= 0);
|
||||
+ rapl_pmu->n_active--;
|
||||
+ if (rapl_pmu->n_active == 0)
|
||||
+ hrtimer_cancel(&rapl_pmu->hrtimer);
|
||||
|
||||
list_del(&event->active_entry);
|
||||
|
||||
@@ -308,23 +308,23 @@ static void rapl_pmu_event_stop(struct p
|
||||
hwc->state |= PERF_HES_UPTODATE;
|
||||
}
|
||||
|
||||
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static int rapl_pmu_event_add(struct perf_event *event, int mode)
|
||||
{
|
||||
- struct rapl_pmu *pmu = event->pmu_private;
|
||||
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
- raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
|
||||
|
||||
if (mode & PERF_EF_START)
|
||||
- __rapl_pmu_event_start(pmu, event);
|
||||
+ __rapl_pmu_event_start(rapl_pmu, event);
|
||||
|
||||
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -338,7 +338,7 @@ static int rapl_pmu_event_init(struct pe
|
||||
{
|
||||
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
|
||||
int bit, rapl_pmu_idx, ret = 0;
|
||||
- struct rapl_pmu *pmu;
|
||||
+ struct rapl_pmu *rapl_pmu;
|
||||
|
||||
/* only look at RAPL events */
|
||||
if (event->attr.type != rapl_pmus->pmu.type)
|
||||
@@ -370,10 +370,11 @@ static int rapl_pmu_event_init(struct pe
|
||||
return -EINVAL;
|
||||
|
||||
/* must be done before validate_group */
|
||||
- pmu = rapl_pmus->pmus[rapl_pmu_idx];
|
||||
- if (!pmu)
|
||||
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
+ if (!rapl_pmu)
|
||||
return -EINVAL;
|
||||
- event->pmu_private = pmu;
|
||||
+
|
||||
+ event->pmu_private = rapl_pmu;
|
||||
event->hw.event_base = rapl_msrs[bit].msr;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = bit;
|
||||
@@ -600,7 +601,7 @@ static void cleanup_rapl_pmus(void)
|
||||
int i;
|
||||
|
||||
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
|
||||
- kfree(rapl_pmus->pmus[i]);
|
||||
+ kfree(rapl_pmus->rapl_pmu[i]);
|
||||
kfree(rapl_pmus);
|
||||
}
|
||||
|
||||
@@ -615,7 +616,7 @@ static const struct attribute_group *rap
|
||||
|
||||
static void __init init_rapl_pmu(void)
|
||||
{
|
||||
- struct rapl_pmu *pmu;
|
||||
+ struct rapl_pmu *rapl_pmu;
|
||||
int cpu, rapl_pmu_idx;
|
||||
|
||||
cpus_read_lock();
|
||||
@@ -625,19 +626,19 @@ static void __init init_rapl_pmu(void)
|
||||
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
continue;
|
||||
|
||||
- pmu = rapl_pmus->pmus[rapl_pmu_idx];
|
||||
- if (pmu)
|
||||
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
+ if (rapl_pmu)
|
||||
continue;
|
||||
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
|
||||
- if (!pmu)
|
||||
+ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu));
|
||||
+ if (!rapl_pmu)
|
||||
continue;
|
||||
- raw_spin_lock_init(&pmu->lock);
|
||||
- INIT_LIST_HEAD(&pmu->active_list);
|
||||
- pmu->pmu = &rapl_pmus->pmu;
|
||||
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
- rapl_hrtimer_init(pmu);
|
||||
+ raw_spin_lock_init(&rapl_pmu->lock);
|
||||
+ INIT_LIST_HEAD(&rapl_pmu->active_list);
|
||||
+ rapl_pmu->pmu = &rapl_pmus->pmu;
|
||||
+ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
+ rapl_hrtimer_init(rapl_pmu);
|
||||
|
||||
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
|
||||
+ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
|
||||
}
|
||||
|
||||
cpus_read_unlock();
|
||||
@@ -653,7 +654,7 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
}
|
||||
|
||||
- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
|
||||
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
return -ENOMEM;
|
||||
|
75 debian/patches/patchset-pf/amd-rapl/0012-perf-x86-rapl-Make-rapl_model-struct-global.patch vendored Normal file
@@ -0,0 +1,75 @@
|
||||
From 68614752b9fd6b6bae6f9ab7b02fc28350c5a541 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:56 +0000
Subject: perf/x86/rapl: Make rapl_model struct global

Preparation for per-core energy counter support addition for AMD CPUs.

As there will always be just one rapl_model variable on a system, make it
global, to make it easier to access it from any function.

No functional change.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -138,6 +138,7 @@ static struct rapl_pmus *rapl_pmus;
|
||||
static unsigned int rapl_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
+static struct rapl_model *rapl_model;
|
||||
|
||||
/*
|
||||
* RAPL Package energy counter scope:
|
||||
@@ -536,18 +537,18 @@ static struct perf_msr amd_rapl_msrs[] =
|
||||
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
|
||||
};
|
||||
|
||||
-static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
+static int rapl_check_hw_unit(void)
|
||||
{
|
||||
u64 msr_rapl_power_unit_bits;
|
||||
int i;
|
||||
|
||||
/* protect rdmsrl() to handle virtualization */
|
||||
- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
+ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
return -1;
|
||||
for (i = 0; i < NR_RAPL_DOMAINS; i++)
|
||||
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
- switch (rm->unit_quirk) {
|
||||
+ switch (rapl_model->unit_quirk) {
|
||||
/*
|
||||
* DRAM domain on HSW server and KNL has fixed energy unit which can be
|
||||
* different than the unit from power unit MSR. See
|
||||
@@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
|
||||
static int __init rapl_pmu_init(void)
|
||||
{
|
||||
const struct x86_cpu_id *id;
|
||||
- struct rapl_model *rm;
|
||||
int ret;
|
||||
|
||||
id = x86_match_cpu(rapl_model_match);
|
||||
if (!id)
|
||||
return -ENODEV;
|
||||
|
||||
- rm = (struct rapl_model *) id->driver_data;
|
||||
+ rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
- rapl_msrs = rm->rapl_msrs;
|
||||
+ rapl_msrs = rapl_model->rapl_msrs;
|
||||
|
||||
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
|
||||
- false, (void *) &rm->events);
|
||||
+ false, (void *) &rapl_model->events);
|
||||
|
||||
- ret = rapl_check_hw_unit(rm);
|
||||
+ ret = rapl_check_hw_unit();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
112 debian/patches/patchset-pf/amd-rapl/0013-perf-x86-rapl-Add-arguments-to-the-cleanup-and-init-.patch vendored Normal file
@@ -0,0 +1,112 @@
|
||||
From b10b887510ccb0b6bc7294888982b862703c9c32 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:57 +0000
Subject: perf/x86/rapl: Add arguments to the cleanup and init functions

Prep for per-core RAPL PMU addition.

No functional change.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -597,7 +597,7 @@ static void __init rapl_advertise(void)
|
||||
}
|
||||
}
|
||||
|
||||
-static void cleanup_rapl_pmus(void)
|
||||
+static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -615,7 +615,7 @@ static const struct attribute_group *rap
|
||||
NULL,
|
||||
};
|
||||
|
||||
-static void __init init_rapl_pmu(void)
|
||||
+static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
int cpu, rapl_pmu_idx;
|
||||
@@ -645,20 +645,22 @@ static void __init init_rapl_pmu(void)
|
||||
cpus_read_unlock();
|
||||
}
|
||||
|
||||
-static int __init init_rapl_pmus(void)
|
||||
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope)
|
||||
{
|
||||
- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
|
||||
- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
+ int nr_rapl_pmu;
|
||||
+ struct rapl_pmus *rapl_pmus;
|
||||
|
||||
- if (rapl_pmu_is_pkg_scope()) {
|
||||
- nr_rapl_pmu = topology_max_packages();
|
||||
- rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
- }
|
||||
+ if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG)
|
||||
+ nr_rapl_pmu = topology_max_packages();
|
||||
+ else
|
||||
+ nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
|
||||
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
return -ENOMEM;
|
||||
|
||||
+ *rapl_pmus_ptr = rapl_pmus;
|
||||
+
|
||||
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
|
||||
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
|
||||
rapl_pmus->pmu.attr_update = rapl_attr_update;
|
||||
@@ -673,7 +675,7 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
|
||||
- init_rapl_pmu();
|
||||
+ init_rapl_pmu(rapl_pmus);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -799,8 +801,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
|
||||
static int __init rapl_pmu_init(void)
|
||||
{
|
||||
const struct x86_cpu_id *id;
|
||||
+ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
int ret;
|
||||
|
||||
+ if (rapl_pmu_is_pkg_scope())
|
||||
+ rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
+
|
||||
id = x86_match_cpu(rapl_model_match);
|
||||
if (!id)
|
||||
return -ENODEV;
|
||||
@@ -816,7 +822,7 @@ static int __init rapl_pmu_init(void)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- ret = init_rapl_pmus();
|
||||
+ ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -829,7 +835,7 @@ static int __init rapl_pmu_init(void)
|
||||
|
||||
out:
|
||||
pr_warn("Initialization failed (%d), disabled\n", ret);
|
||||
- cleanup_rapl_pmus();
|
||||
+ cleanup_rapl_pmus(rapl_pmus);
|
||||
return ret;
|
||||
}
|
||||
module_init(rapl_pmu_init);
|
||||
@@ -837,6 +843,6 @@ module_init(rapl_pmu_init);
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
perf_pmu_unregister(&rapl_pmus->pmu);
|
||||
- cleanup_rapl_pmus();
|
||||
+ cleanup_rapl_pmus(rapl_pmus);
|
||||
}
|
||||
module_exit(intel_rapl_exit);
|
358 debian/patches/patchset-pf/amd-rapl/0014-perf-x86-rapl-Modify-the-generic-variable-names-to-_.patch vendored Normal file
@@ -0,0 +1,358 @@
|
||||
From b5c83c40540298a39f8314034b705f1236b17a9f Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:58 +0000
Subject: perf/x86/rapl: Modify the generic variable names to *_pkg*

Prep for addition of power_per_core PMU to handle core scope energy
consumption for AMD CPUs.

Replace the generic names with *_pkg*, to differentiate between the
scopes of the two different PMUs and their variables.

No functional change.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 118 ++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -70,18 +70,18 @@ MODULE_LICENSE("GPL");
|
||||
/*
|
||||
* RAPL energy status counters
|
||||
*/
|
||||
-enum perf_rapl_events {
|
||||
+enum perf_rapl_pkg_events {
|
||||
PERF_RAPL_PP0 = 0, /* all cores */
|
||||
PERF_RAPL_PKG, /* entire package */
|
||||
PERF_RAPL_RAM, /* DRAM */
|
||||
PERF_RAPL_PP1, /* gpu */
|
||||
PERF_RAPL_PSYS, /* psys */
|
||||
|
||||
- PERF_RAPL_MAX,
|
||||
- NR_RAPL_DOMAINS = PERF_RAPL_MAX,
|
||||
+ PERF_RAPL_PKG_EVENTS_MAX,
|
||||
+ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
|
||||
};
|
||||
|
||||
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
|
||||
+static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
|
||||
"pp0-core",
|
||||
"package",
|
||||
"dram",
|
||||
@@ -126,16 +126,16 @@ enum rapl_unit_quirk {
|
||||
};
|
||||
|
||||
struct rapl_model {
|
||||
- struct perf_msr *rapl_msrs;
|
||||
- unsigned long events;
|
||||
+ struct perf_msr *rapl_pkg_msrs;
|
||||
+ unsigned long pkg_events;
|
||||
unsigned int msr_power_unit;
|
||||
enum rapl_unit_quirk unit_quirk;
|
||||
};
|
||||
|
||||
/* 1/2^hw_unit Joule */
|
||||
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
|
||||
-static struct rapl_pmus *rapl_pmus;
|
||||
-static unsigned int rapl_cntr_mask;
|
||||
+static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
|
||||
+static struct rapl_pmus *rapl_pmus_pkg;
|
||||
+static unsigned int rapl_pkg_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
static struct rapl_model *rapl_model;
|
||||
@@ -149,7 +149,7 @@ static struct rapl_model *rapl_model;
|
||||
* considered as either pkg-scope or die-scope, and we are considering
|
||||
* them as die-scope.
|
||||
*/
|
||||
-#define rapl_pmu_is_pkg_scope() \
|
||||
+#define rapl_pkg_pmu_is_pkg_scope() \
|
||||
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
|
||||
@@ -159,7 +159,7 @@ static struct rapl_model *rapl_model;
|
||||
*/
|
||||
static inline unsigned int get_rapl_pmu_idx(int cpu)
|
||||
{
|
||||
- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
+ return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
topology_logical_die_id(cpu);
|
||||
}
|
||||
|
||||
@@ -172,7 +172,7 @@ static inline u64 rapl_read_counter(stru
|
||||
|
||||
static inline u64 rapl_scale(u64 v, int cfg)
|
||||
{
|
||||
- if (cfg > NR_RAPL_DOMAINS) {
|
||||
+ if (cfg > NR_RAPL_PKG_DOMAINS) {
|
||||
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
|
||||
return v;
|
||||
}
|
||||
@@ -182,7 +182,7 @@ static inline u64 rapl_scale(u64 v, int
|
||||
* or use ldexp(count, -32).
|
||||
* Watts = Joules/Time delta
|
||||
*/
|
||||
- return v << (32 - rapl_hw_unit[cfg - 1]);
|
||||
+ return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
|
||||
}
|
||||
|
||||
static u64 rapl_event_update(struct perf_event *event)
|
||||
@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
|
||||
/* only look at RAPL events */
|
||||
- if (event->attr.type != rapl_pmus->pmu.type)
|
||||
+ if (event->attr.type != rapl_pmus_pkg->pmu.type)
|
||||
return -ENOENT;
|
||||
|
||||
/* check only supported bits are set */
|
||||
@@ -352,14 +352,14 @@ static int rapl_pmu_event_init(struct pe
|
||||
if (event->cpu < 0)
|
||||
return -EINVAL;
|
||||
|
||||
- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
|
||||
+ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
|
||||
+ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
|
||||
bit = cfg - 1;
|
||||
|
||||
/* check event supported */
|
||||
- if (!(rapl_cntr_mask & (1 << bit)))
|
||||
+ if (!(rapl_pkg_cntr_mask & (1 << bit)))
|
||||
return -EINVAL;
|
||||
|
||||
/* unsupported modes and filters */
|
||||
@@ -367,11 +367,11 @@ static int rapl_pmu_event_init(struct pe
|
||||
return -EINVAL;
|
||||
|
||||
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
|
||||
- if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
+ if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
/* must be done before validate_group */
|
||||
- rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
+ rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx];
|
||||
if (!rapl_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
@@ -525,11 +525,11 @@ static struct perf_msr intel_rapl_spr_ms
|
||||
};
|
||||
|
||||
/*
|
||||
- * Force to PERF_RAPL_MAX size due to:
|
||||
- * - perf_msr_probe(PERF_RAPL_MAX)
|
||||
+ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
|
||||
+ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
|
||||
* - want to use same event codes across both architectures
|
||||
*/
|
||||
-static struct perf_msr amd_rapl_msrs[] = {
|
||||
+static struct perf_msr amd_rapl_pkg_msrs[] = {
|
||||
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
|
||||
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
|
||||
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
|
||||
@@ -545,8 +545,8 @@ static int rapl_check_hw_unit(void)
|
||||
/* protect rdmsrl() to handle virtualization */
|
||||
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
return -1;
|
||||
- for (i = 0; i < NR_RAPL_DOMAINS; i++)
|
||||
- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
|
||||
+ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
switch (rapl_model->unit_quirk) {
|
||||
/*
|
||||
@@ -556,11 +556,11 @@ static int rapl_check_hw_unit(void)
|
||||
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
|
||||
*/
|
||||
case RAPL_UNIT_QUIRK_INTEL_HSW:
|
||||
- rapl_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
+ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
break;
|
||||
/* SPR uses a fixed energy unit for Psys domain. */
|
||||
case RAPL_UNIT_QUIRK_INTEL_SPR:
|
||||
- rapl_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
+ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -575,9 +575,9 @@ static int rapl_check_hw_unit(void)
|
||||
* if hw unit is 32, then we use 2 ms 1/200/2
|
||||
*/
|
||||
rapl_timer_ms = 2;
|
||||
- if (rapl_hw_unit[0] < 32) {
|
||||
+ if (rapl_pkg_hw_unit[0] < 32) {
|
||||
rapl_timer_ms = (1000 / (2 * 100));
|
||||
- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
|
||||
+ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -587,12 +587,12 @@ static void __init rapl_advertise(void)
|
||||
int i;
|
||||
|
||||
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
|
||||
- hweight32(rapl_cntr_mask), rapl_timer_ms);
|
||||
+ hweight32(rapl_pkg_cntr_mask), rapl_timer_ms);
|
||||
|
||||
- for (i = 0; i < NR_RAPL_DOMAINS; i++) {
|
||||
- if (rapl_cntr_mask & (1 << i)) {
|
||||
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
|
||||
+ if (rapl_pkg_cntr_mask & (1 << i)) {
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
- rapl_domain_names[i], rapl_hw_unit[i]);
|
||||
+ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -681,71 +681,71 @@ static int __init init_rapl_pmus(struct
|
||||
}
|
||||
|
||||
static struct rapl_model model_snb = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_snbep = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsw = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsx = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_knl = {
|
||||
- .events = BIT(PERF_RAPL_PKG) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_skl = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_spr = {
|
||||
- .events = BIT(PERF_RAPL_PP0) |
|
||||
+ .pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = intel_rapl_spr_msrs,
|
||||
+ .rapl_pkg_msrs = intel_rapl_spr_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_amd_hygon = {
|
||||
- .events = BIT(PERF_RAPL_PKG),
|
||||
+ .pkg_events = BIT(PERF_RAPL_PKG),
|
||||
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
|
||||
- .rapl_msrs = amd_rapl_msrs,
|
||||
+ .rapl_pkg_msrs = amd_rapl_pkg_msrs,
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id rapl_model_match[] __initconst = {
|
||||
@@ -801,11 +801,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
|
||||
static int __init rapl_pmu_init(void)
|
||||
{
|
||||
const struct x86_cpu_id *id;
|
||||
- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
+ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
int ret;
|
||||
|
||||
- if (rapl_pmu_is_pkg_scope())
|
||||
- rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
+ if (rapl_pkg_pmu_is_pkg_scope())
|
||||
+ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
|
||||
id = x86_match_cpu(rapl_model_match);
|
||||
if (!id)
|
||||
@@ -813,20 +813,20 @@ static int __init rapl_pmu_init(void)
|
||||
|
||||
rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
- rapl_msrs = rapl_model->rapl_msrs;
|
||||
+ rapl_msrs = rapl_model->rapl_pkg_msrs;
|
||||
|
||||
- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
|
||||
- false, (void *) &rapl_model->events);
|
||||
+ rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
|
||||
+ false, (void *) &rapl_model->pkg_events);
|
||||
|
||||
ret = rapl_check_hw_unit();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope);
|
||||
+ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
|
||||
+ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -835,14 +835,14 @@ static int __init rapl_pmu_init(void)
|
||||
|
||||
out:
|
||||
pr_warn("Initialization failed (%d), disabled\n", ret);
|
||||
- cleanup_rapl_pmus(rapl_pmus);
|
||||
+ cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
return ret;
|
||||
}
|
||||
module_init(rapl_pmu_init);
|
||||
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
- perf_pmu_unregister(&rapl_pmus->pmu);
|
||||
- cleanup_rapl_pmus(rapl_pmus);
|
||||
+ perf_pmu_unregister(&rapl_pmus_pkg->pmu);
|
||||
+ cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
}
|
||||
module_exit(intel_rapl_exit);
|
@@ -0,0 +1,47 @@
|
||||
From dbc0343069c8f86fad0d8d9075f70f79114ef10a Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:59 +0000
Subject: perf/x86/rapl: Remove the global variable rapl_msrs

After making the rapl_model struct global, the rapl_msrs global
variable isn't needed, so remove it.

Also it will be cleaner when new per-core scope PMU is added. As we will
need to maintain two rapl_msrs array(one for per-core scope and one for
package scope PMU), inside the rapl_model struct.

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -137,7 +137,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_
|
||||
static struct rapl_pmus *rapl_pmus_pkg;
|
||||
static unsigned int rapl_pkg_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
-static struct perf_msr *rapl_msrs;
|
||||
static struct rapl_model *rapl_model;
|
||||
|
||||
/*
|
||||
@@ -376,7 +375,7 @@ static int rapl_pmu_event_init(struct pe
|
||||
return -EINVAL;
|
||||
|
||||
event->pmu_private = rapl_pmu;
|
||||
- event->hw.event_base = rapl_msrs[bit].msr;
|
||||
+ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = bit;
|
||||
|
||||
@@ -813,9 +812,7 @@ static int __init rapl_pmu_init(void)
|
||||
|
||||
rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
- rapl_msrs = rapl_model->rapl_pkg_msrs;
|
||||
-
|
||||
- rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
|
||||
+ rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
|
||||
false, (void *) &rapl_model->pkg_events);
|
||||
|
||||
ret = rapl_check_hw_unit();
|
@@ -0,0 +1,79 @@
|
||||
From d6a5a28382558b896767a78db795d421015831a7 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:48:00 +0000
Subject: perf/x86/rapl: Move the cntr_mask to rapl_pmus struct

Preparation for the addition of per-core RAPL energy counter for AMD
CPUs.

Moving cntr_mask to rapl_pmus struct instead of adding a new global
cntr_mask for the per-core RAPL energy counter, will ensure that the
"per_core_cntr_mask" is only created if needed (i.e. in case of AMD
CPUs).

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -116,6 +116,7 @@ struct rapl_pmu {
|
||||
struct rapl_pmus {
|
||||
struct pmu pmu;
|
||||
unsigned int nr_rapl_pmu;
|
||||
+ unsigned int cntr_mask;
|
||||
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
|
||||
};
|
||||
|
||||
@@ -135,7 +136,6 @@ struct rapl_model {
|
||||
/* 1/2^hw_unit Joule */
|
||||
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus_pkg;
|
||||
-static unsigned int rapl_pkg_cntr_mask;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct rapl_model *rapl_model;
|
||||
|
||||
@@ -358,7 +358,7 @@ static int rapl_pmu_event_init(struct pe
|
||||
bit = cfg - 1;
|
||||
|
||||
/* check event supported */
|
||||
- if (!(rapl_pkg_cntr_mask & (1 << bit)))
|
||||
+ if (!(rapl_pmus_pkg->cntr_mask & (1 << bit)))
|
||||
return -EINVAL;
|
||||
|
||||
/* unsupported modes and filters */
|
||||
@@ -586,10 +586,10 @@ static void __init rapl_advertise(void)
|
||||
int i;
|
||||
|
||||
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
|
||||
- hweight32(rapl_pkg_cntr_mask), rapl_timer_ms);
|
||||
+ hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms);
|
||||
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
|
||||
- if (rapl_pkg_cntr_mask & (1 << i)) {
|
||||
+ if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
|
||||
}
|
||||
@@ -812,9 +812,6 @@ static int __init rapl_pmu_init(void)
|
||||
|
||||
rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
- rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
|
||||
- false, (void *) &rapl_model->pkg_events);
|
||||
-
|
||||
ret = rapl_check_hw_unit();
|
||||
if (ret)
|
||||
return ret;
|
||||
@@ -823,6 +820,10 @@ static int __init rapl_pmu_init(void)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
+ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
|
||||
+ PERF_RAPL_PKG_EVENTS_MAX, false,
|
||||
+ (void *) &rapl_model->pkg_events);
|
||||
+
|
||||
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
|
||||
if (ret)
|
||||
goto out;
|
439 debian/patches/patchset-pf/amd-rapl/0017-perf-x86-rapl-Add-per-core-energy-counter-support-fo.patch vendored Normal file
@@ -0,0 +1,439 @@
|
||||
From 3cb480ec2950f4c6351c602552fc4f9a8e524b89 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:48:01 +0000
Subject: perf/x86/rapl: Add per-core energy counter support for AMD CPUs

Add a new "power_per_core" PMU and "energy-per-core" event for
monitoring energy consumption by each core. The existing energy-cores
event aggregates the energy consumption at the package level.
This new event aligns with the AMD's per_core energy counters.

Tested the package level and core level PMU counters with workloads
pinned to different CPUs.

Results with workload pinned to CPU 1 in core 1 on a AMD Zen4 Genoa
machine:

$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1

 Performance counter stats for 'system wide':

S0-D0-C0    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C1    1    5.72 Joules power_per_core/energy-per-core/
S0-D0-C2    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C3    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C4    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C5    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C6    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C7    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C8    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C9    1    0.02 Joules power_per_core/energy-per-core/
S0-D0-C10   1    0.02 Joules power_per_core/energy-per-core/

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 178 +++++++++++++++++++++++++++++++++--------
 1 file changed, 143 insertions(+), 35 deletions(-)

--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -39,6 +39,10 @@
|
||||
* event: rapl_energy_psys
|
||||
* perf code: 0x5
|
||||
*
|
||||
+ * per_core counter: consumption of a single physical core
|
||||
+ * event: rapl_energy_per_core (power_per_core PMU)
|
||||
+ * perf code: 0x1
|
||||
+ *
|
||||
* We manage those counters as free running (read-only). They may be
|
||||
* use simultaneously by other tools, such as turbostat.
|
||||
*
|
||||
@@ -81,6 +85,10 @@ enum perf_rapl_pkg_events {
|
||||
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
|
||||
};
|
||||
|
||||
+#define PERF_RAPL_PER_CORE 0 /* per-core */
|
||||
+#define PERF_RAPL_CORE_EVENTS_MAX 1
|
||||
+#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
|
||||
+
|
||||
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
|
||||
"pp0-core",
|
||||
"package",
|
||||
@@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain
|
||||
"psys",
|
||||
};
|
||||
|
||||
+static const char *const rapl_core_domain_name __initconst = "per-core";
|
||||
+
|
||||
/*
|
||||
* event code: LSB 8 bits, passed in attr->config
|
||||
* any other bit is reserved
|
||||
@@ -128,14 +138,18 @@ enum rapl_unit_quirk {
|
||||
|
||||
struct rapl_model {
|
||||
struct perf_msr *rapl_pkg_msrs;
|
||||
+ struct perf_msr *rapl_core_msrs;
|
||||
unsigned long pkg_events;
|
||||
+ unsigned long core_events;
|
||||
unsigned int msr_power_unit;
|
||||
enum rapl_unit_quirk unit_quirk;
|
||||
};
|
||||
|
||||
/* 1/2^hw_unit Joule */
|
||||
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
|
||||
+static int rapl_core_hw_unit __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus_pkg;
|
||||
+static struct rapl_pmus *rapl_pmus_core;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct rapl_model *rapl_model;
|
||||
|
||||
@@ -156,10 +170,14 @@ static struct rapl_model *rapl_model;
|
||||
* Helper function to get the correct topology id according to the
|
||||
* RAPL PMU scope.
|
||||
*/
|
||||
-static inline unsigned int get_rapl_pmu_idx(int cpu)
|
||||
+static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
|
||||
{
|
||||
- return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
- topology_logical_die_id(cpu);
|
||||
+ if (scope == PERF_PMU_SCOPE_PKG)
|
||||
+ return topology_logical_package_id(cpu);
|
||||
+ else if (scope == PERF_PMU_SCOPE_DIE)
|
||||
+ return topology_logical_die_id(cpu);
|
||||
+ else
|
||||
+ return topology_logical_core_id(cpu);
|
||||
}
|
||||
|
||||
static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
@@ -169,19 +187,20 @@ static inline u64 rapl_read_counter(stru
|
||||
return raw;
|
||||
}
|
||||
|
||||
-static inline u64 rapl_scale(u64 v, int cfg)
|
||||
+static inline u64 rapl_scale(u64 v, struct perf_event *event)
|
||||
{
|
||||
- if (cfg > NR_RAPL_PKG_DOMAINS) {
|
||||
- pr_warn("Invalid domain %d, failed to scale data\n", cfg);
|
||||
- return v;
|
||||
- }
|
||||
+ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
|
||||
+
|
||||
+ if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
|
||||
+ hw_unit = rapl_core_hw_unit;
|
||||
+
|
||||
/*
|
||||
* scale delta to smallest unit (1/2^32)
|
||||
* users must then scale back: count * 1/(1e9*2^32) to get Joules
|
||||
* or use ldexp(count, -32).
|
||||
* Watts = Joules/Time delta
|
||||
*/
|
||||
- return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
|
||||
+ return v << (32 - hw_unit);
|
||||
}
|
||||
|
||||
static u64 rapl_event_update(struct perf_event *event)
|
||||
@@ -208,7 +227,7 @@ static u64 rapl_event_update(struct perf
|
||||
delta = (new_raw_count << shift) - (prev_raw_count << shift);
|
||||
delta >>= shift;
|
||||
|
||||
- sdelta = rapl_scale(delta, event->hw.config);
|
||||
+ sdelta = rapl_scale(delta, event);
|
||||
|
||||
local64_add(sdelta, &event->count);
|
||||
|
||||
@@ -337,12 +356,13 @@ static void rapl_pmu_event_del(struct pe
|
||||
static int rapl_pmu_event_init(struct perf_event *event)
|
||||
{
|
||||
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
|
||||
- int bit, rapl_pmu_idx, ret = 0;
|
||||
+ int bit, rapl_pmus_scope, rapl_pmu_idx, ret = 0;
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
+ struct rapl_pmus *rapl_pmus;
|
||||
|
||||
- /* only look at RAPL events */
|
||||
- if (event->attr.type != rapl_pmus_pkg->pmu.type)
|
||||
- return -ENOENT;
|
||||
+ /* unsupported modes and filters */
|
||||
+ if (event->attr.sample_period) /* no sampling */
|
||||
+ return -EINVAL;
|
||||
|
||||
/* check only supported bits are set */
|
||||
if (event->attr.config & ~RAPL_EVENT_MASK)
|
||||
@@ -351,31 +371,49 @@ static int rapl_pmu_event_init(struct pe
|
||||
if (event->cpu < 0)
|
||||
return -EINVAL;
|
||||
|
||||
- if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
+ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
|
||||
+ if (!rapl_pmus)
|
||||
return -EINVAL;
|
||||
+ rapl_pmus_scope = rapl_pmus->pmu.scope;
|
||||
|
||||
- cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
|
||||
- bit = cfg - 1;
|
||||
-
|
||||
- /* check event supported */
|
||||
- if (!(rapl_pmus_pkg->cntr_mask & (1 << bit)))
|
||||
+ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
|
||||
+ /* only look at RAPL package events */
|
||||
+ if (event->attr.type != rapl_pmus_pkg->pmu.type)
|
||||
+ return -ENOENT;
|
||||
+
|
||||
+ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
|
||||
+ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ bit = cfg - 1;
|
||||
+ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
|
||||
+ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
|
||||
+ /* only look at RAPL per-core events */
|
||||
+ if (event->attr.type != rapl_pmus_core->pmu.type)
|
||||
+ return -ENOENT;
|
||||
+
|
||||
+ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
|
||||
+ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ bit = cfg - 1;
|
||||
+ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
|
||||
+ } else
|
||||
return -EINVAL;
|
||||
|
||||
- /* unsupported modes and filters */
|
||||
- if (event->attr.sample_period) /* no sampling */
|
||||
+ /* check event supported */
|
||||
+ if (!(rapl_pmus->cntr_mask & (1 << bit)))
|
||||
return -EINVAL;
|
||||
|
||||
- rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
|
||||
- if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu)
|
||||
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
|
||||
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
return -EINVAL;
|
||||
-
|
||||
/* must be done before validate_group */
|
||||
- rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx];
|
||||
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
if (!rapl_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
event->pmu_private = rapl_pmu;
|
||||
- event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = bit;
|
||||
|
||||
@@ -392,12 +430,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl
|
||||
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
|
||||
+RAPL_EVENT_ATTR_STR(energy-per-core, rapl_per_core, "event=0x01");
|
||||
|
||||
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
|
||||
+RAPL_EVENT_ATTR_STR(energy-per-core.unit, rapl_per_core_unit, "Joules");
|
||||
|
||||
/*
|
||||
* we compute in 0.23 nJ increments regardless of MSR
|
||||
@@ -407,6 +447,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale,
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
|
||||
+RAPL_EVENT_ATTR_STR(energy-per-core.scale, rapl_per_core_scale, "2.3283064365386962890625e-10");
|
||||
|
||||
/*
|
||||
* There are no default events, but we need to create
|
||||
@@ -439,6 +480,12 @@ static const struct attribute_group *rap
|
||||
NULL,
|
||||
};
|
||||
|
||||
+static const struct attribute_group *rapl_per_core_attr_groups[] = {
|
||||
+ &rapl_pmu_format_group,
|
||||
+ &rapl_pmu_events_group,
|
||||
+ NULL,
|
||||
+};
|
||||
+
|
||||
static struct attribute *rapl_events_cores[] = {
|
||||
EVENT_PTR(rapl_cores),
|
||||
EVENT_PTR(rapl_cores_unit),
|
||||
@@ -499,6 +546,18 @@ static struct attribute_group rapl_event
|
||||
.attrs = rapl_events_psys,
|
||||
};
|
||||
|
||||
+static struct attribute *rapl_events_per_core[] = {
|
||||
+ EVENT_PTR(rapl_per_core),
|
||||
+ EVENT_PTR(rapl_per_core_unit),
|
||||
+ EVENT_PTR(rapl_per_core_scale),
|
||||
+ NULL,
|
||||
+};
|
||||
+
|
||||
+static struct attribute_group rapl_events_per_core_group = {
|
||||
+ .name = "events",
|
||||
+ .attrs = rapl_events_per_core,
|
||||
+};
|
||||
+
|
||||
static bool test_msr(int idx, void *data)
|
||||
{
|
||||
return test_bit(idx, (unsigned long *) data);
|
||||
@@ -536,6 +595,11 @@ static struct perf_msr amd_rapl_pkg_msrs
|
||||
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
|
||||
};
|
||||
|
||||
+static struct perf_msr amd_rapl_core_msrs[] = {
|
||||
+ [PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group,
|
||||
+ test_msr, false, RAPL_MSR_MASK },
|
||||
+};
|
||||
+
|
||||
static int rapl_check_hw_unit(void)
|
||||
{
|
||||
u64 msr_rapl_power_unit_bits;
|
||||
@@ -547,6 +611,8 @@ static int rapl_check_hw_unit(void)
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
|
||||
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
+ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
+
|
||||
switch (rapl_model->unit_quirk) {
|
||||
/*
|
||||
* DRAM domain on HSW server and KNL has fixed energy unit which can be
|
||||
@@ -565,7 +631,6 @@ static int rapl_check_hw_unit(void)
|
||||
break;
|
||||
}
|
||||
|
||||
-
|
||||
/*
|
||||
* Calculate the timer rate:
|
||||
* Use reference of 200W for scaling the timeout to avoid counter
|
||||
@@ -584,9 +649,13 @@ static int rapl_check_hw_unit(void)
|
||||
static void __init rapl_advertise(void)
|
||||
{
|
||||
int i;
|
||||
+ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
|
||||
+
|
||||
+ if (rapl_pmus_core)
|
||||
+ num_counters += hweight32(rapl_pmus_core->cntr_mask);
|
||||
|
||||
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
|
||||
- hweight32(rapl_pmus_pkg->cntr_mask), rapl_timer_ms);
|
||||
+ num_counters, rapl_timer_ms);
|
||||
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
|
||||
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
|
||||
@@ -594,6 +663,10 @@ static void __init rapl_advertise(void)
|
||||
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
|
||||
}
|
||||
}
|
||||
+
|
||||
+ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_PER_CORE)))
|
||||
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
+ rapl_core_domain_name, rapl_core_hw_unit);
|
||||
}
|
||||
|
||||
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
|
||||
@@ -614,6 +687,10 @@ static const struct attribute_group *rap
|
||||
NULL,
|
||||
};
|
||||
|
||||
+static const struct attribute_group *rapl_per_core_attr_update[] = {
|
||||
+ &rapl_events_per_core_group,
|
||||
+};
|
||||
+
|
||||
static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
@@ -622,10 +699,9 @@ static void __init init_rapl_pmu(struct
|
||||
cpus_read_lock();
|
||||
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
- rapl_pmu_idx = get_rapl_pmu_idx(cpu);
|
||||
+ rapl_pmu_idx = get_rapl_pmu_idx(cpu, rapl_pmus->pmu.scope);
|
||||
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
continue;
|
||||
-
|
||||
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
if (rapl_pmu)
|
||||
continue;
|
||||
@@ -644,15 +720,19 @@ static void __init init_rapl_pmu(struct
|
||||
cpus_read_unlock();
|
||||
}
|
||||
|
||||
-static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope)
|
||||
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
|
||||
+ const struct attribute_group **rapl_attr_groups,
|
||||
+ const struct attribute_group **rapl_attr_update)
|
||||
{
|
||||
int nr_rapl_pmu;
|
||||
struct rapl_pmus *rapl_pmus;
|
||||
|
||||
if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG)
|
||||
nr_rapl_pmu = topology_max_packages();
|
||||
- else
|
||||
+ else if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
|
||||
nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
|
||||
+ else
|
||||
+ nr_rapl_pmu = topology_max_packages() * topology_num_cores_per_package();
|
||||
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
@@ -743,8 +823,10 @@ static struct rapl_model model_spr = {
|
||||
|
||||
static struct rapl_model model_amd_hygon = {
|
||||
.pkg_events = BIT(PERF_RAPL_PKG),
|
||||
+ .core_events = BIT(PERF_RAPL_PER_CORE),
|
||||
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
|
||||
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
|
||||
+ .rapl_core_msrs = amd_rapl_core_msrs,
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id rapl_model_match[] __initconst = {
|
||||
@@ -816,7 +898,8 @@ static int __init rapl_pmu_init(void)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
- ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope);
|
||||
+ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
|
||||
+ rapl_attr_update);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -828,6 +911,27 @@ static int __init rapl_pmu_init(void)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
+ if (rapl_model->core_events) {
|
||||
+ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
|
||||
+ rapl_per_core_attr_groups,
|
||||
+ rapl_per_core_attr_update);
|
||||
+ if (ret) {
|
||||
+ pr_warn("Per-core PMU initialization failed (%d)\n", ret);
|
||||
+ goto per_core_init_failed;
|
||||
+ }
|
||||
+
|
||||
+ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
|
||||
+ PERF_RAPL_CORE_EVENTS_MAX, false,
|
||||
+ (void *) &rapl_model->core_events);
|
||||
+
|
||||
+ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
|
||||
+ if (ret) {
|
||||
+ pr_warn("Per-core PMU registration failed (%d)\n", ret);
|
||||
+ cleanup_rapl_pmus(rapl_pmus_core);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+per_core_init_failed:
|
||||
rapl_advertise();
|
||||
return 0;
|
||||
|
||||
@@ -840,6 +944,10 @@ module_init(rapl_pmu_init);
|
||||
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
+ if (rapl_pmus_core) {
|
||||
+ perf_pmu_unregister(&rapl_pmus_core->pmu);
|
||||
+ cleanup_rapl_pmus(rapl_pmus_core);
|
||||
+ }
|
||||
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
|
||||
cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
}
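
Side note (not part of the patch series): the new energy-per-core event is a plain perf counter, so it can also be read directly from user space with perf_event_open(). Below is a minimal illustrative C sketch under the assumptions stated in the patch above (PMU registered as "power_per_core", event code 0x01, counter unit 2^-32 Joules); the program itself is hypothetical and needs perf permissions (e.g. root or CAP_PERFMON).

/* Illustrative sketch only -- not part of the patch series.
 * Reads the energy-per-core counter of CPU 0 over roughly one second,
 * assuming the power_per_core PMU described in the patch above. */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	FILE *f;
	int type, fd;

	/* The PMU type id is assigned at registration and exported via sysfs. */
	f = fopen("/sys/bus/event_source/devices/power_per_core/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1) {
		fprintf(stderr, "power_per_core PMU not available\n");
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = 0x01;		/* energy-per-core */

	/* The counter is system-wide and per-core: pid = -1, cpu = 0. */
	fd = syscall(SYS_perf_event_open, &attr, -1, 0, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);
	if (read(fd, &count, sizeof(count)) != sizeof(count)) {
		perror("read");
		return 1;
	}

	/* One counter increment is 2^-32 Joules, matching the sysfs scale. */
	printf("core 0 consumed %.6f Joules over ~1s\n",
	       count * 2.3283064365386962890625e-10);
	return 0;
}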
180 debian/patches/patchset-pf/cpuidle/0001-cpuidle-menu-Remove-iowait-influence.patch vendored Normal file
@@ -0,0 +1,180 @@
|
||||
From d31e903a364802c068ff23bdd448cc70eda71a7c Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:38 +0100
Subject: cpuidle: menu: Remove iowait influence

Remove CPU iowaiters influence on idle state selection.
Remove the menu notion of performance multiplier which increased with
the number of tasks that went to iowait sleep on this CPU and haven't
woken up yet.

Relying on iowait for cpuidle is problematic for a few reasons:
1. There is no guarantee that an iowaiting task will wake up on the
same CPU.
2. The task being in iowait says nothing about the idle duration, we
could be selecting shallower states for a long time.
3. The task being in iowait doesn't always imply a performance hit
with increased latency.
4. If there is such a performance hit, the number of iowaiting tasks
doesn't directly correlate.
5. The definition of iowait altogether is vague at best, it is
sprinkled across kernel code.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 drivers/cpuidle/governors/menu.c | 76 ++++----------------------------
 1 file changed, 9 insertions(+), 67 deletions(-)

--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,7 @@
|
||||
|
||||
#include "gov.h"
|
||||
|
||||
-#define BUCKETS 12
|
||||
+#define BUCKETS 6
|
||||
#define INTERVAL_SHIFT 3
|
||||
#define INTERVALS (1UL << INTERVAL_SHIFT)
|
||||
#define RESOLUTION 1024
|
||||
@@ -29,12 +29,11 @@
|
||||
/*
|
||||
* Concepts and ideas behind the menu governor
|
||||
*
|
||||
- * For the menu governor, there are 3 decision factors for picking a C
|
||||
+ * For the menu governor, there are 2 decision factors for picking a C
|
||||
* state:
|
||||
* 1) Energy break even point
|
||||
- * 2) Performance impact
|
||||
- * 3) Latency tolerance (from pmqos infrastructure)
|
||||
- * These three factors are treated independently.
|
||||
+ * 2) Latency tolerance (from pmqos infrastructure)
|
||||
+ * These two factors are treated independently.
|
||||
*
|
||||
* Energy break even point
|
||||
* -----------------------
|
||||
@@ -75,30 +74,6 @@
|
||||
* intervals and if the stand deviation of these 8 intervals is below a
|
||||
* threshold value, we use the average of these intervals as prediction.
|
||||
*
|
||||
- * Limiting Performance Impact
|
||||
- * ---------------------------
|
||||
- * C states, especially those with large exit latencies, can have a real
|
||||
- * noticeable impact on workloads, which is not acceptable for most sysadmins,
|
||||
- * and in addition, less performance has a power price of its own.
|
||||
- *
|
||||
- * As a general rule of thumb, menu assumes that the following heuristic
|
||||
- * holds:
|
||||
- * The busier the system, the less impact of C states is acceptable
|
||||
- *
|
||||
- * This rule-of-thumb is implemented using a performance-multiplier:
|
||||
- * If the exit latency times the performance multiplier is longer than
|
||||
- * the predicted duration, the C state is not considered a candidate
|
||||
- * for selection due to a too high performance impact. So the higher
|
||||
- * this multiplier is, the longer we need to be idle to pick a deep C
|
||||
- * state, and thus the less likely a busy CPU will hit such a deep
|
||||
- * C state.
|
||||
- *
|
||||
- * Currently there is only one value determining the factor:
|
||||
- * 10 points are added for each process that is waiting for IO on this CPU.
|
||||
- * (This value was experimentally determined.)
|
||||
- * Utilization is no longer a factor as it was shown that it never contributed
|
||||
- * significantly to the performance multiplier in the first place.
|
||||
- *
|
||||
*/
|
||||
|
||||
struct menu_device {
|
||||
@@ -112,19 +87,10 @@ struct menu_device {
|
||||
int interval_ptr;
|
||||
};
|
||||
|
||||
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
|
||||
+static inline int which_bucket(u64 duration_ns)
|
||||
{
|
||||
int bucket = 0;
|
||||
|
||||
- /*
|
||||
- * We keep two groups of stats; one with no
|
||||
- * IO pending, one without.
|
||||
- * This allows us to calculate
|
||||
- * E(duration)|iowait
|
||||
- */
|
||||
- if (nr_iowaiters)
|
||||
- bucket = BUCKETS/2;
|
||||
-
|
||||
if (duration_ns < 10ULL * NSEC_PER_USEC)
|
||||
return bucket;
|
||||
if (duration_ns < 100ULL * NSEC_PER_USEC)
|
||||
@@ -138,19 +104,6 @@ static inline int which_bucket(u64 durat
|
||||
return bucket + 5;
|
||||
}
|
||||
|
||||
-/*
|
||||
- * Return a multiplier for the exit latency that is intended
|
||||
- * to take performance requirements into account.
|
||||
- * The more performance critical we estimate the system
|
||||
- * to be, the higher this multiplier, and thus the higher
|
||||
- * the barrier to go to an expensive C state.
|
||||
- */
|
||||
-static inline int performance_multiplier(unsigned int nr_iowaiters)
|
||||
-{
|
||||
- /* for IO wait tasks (per cpu!) we add 10x each */
|
||||
- return 1 + 10 * nr_iowaiters;
|
||||
-}
|
||||
-
|
||||
static DEFINE_PER_CPU(struct menu_device, menu_devices);
|
||||
|
||||
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
|
||||
@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_dr
|
||||
struct menu_device *data = this_cpu_ptr(&menu_devices);
|
||||
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
|
||||
u64 predicted_ns;
|
||||
- u64 interactivity_req;
|
||||
- unsigned int nr_iowaiters;
|
||||
ktime_t delta, delta_tick;
|
||||
int i, idx;
|
||||
|
||||
@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_dr
|
||||
data->needs_update = 0;
|
||||
}
|
||||
|
||||
- nr_iowaiters = nr_iowait_cpu(dev->cpu);
|
||||
-
|
||||
/* Find the shortest expected idle interval. */
|
||||
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
|
||||
if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
|
||||
@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_dr
|
||||
}
|
||||
|
||||
data->next_timer_ns = delta;
|
||||
- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
|
||||
+ data->bucket = which_bucket(data->next_timer_ns);
|
||||
|
||||
/* Round up the result for half microseconds. */
|
||||
timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
|
||||
@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_dr
|
||||
*/
|
||||
data->next_timer_ns = KTIME_MAX;
|
||||
delta_tick = TICK_NSEC / 2;
|
||||
- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
|
||||
+ data->bucket = which_bucket(KTIME_MAX);
|
||||
}
|
||||
|
||||
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
|
||||
@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_dr
|
||||
*/
|
||||
if (predicted_ns < TICK_NSEC)
|
||||
predicted_ns = data->next_timer_ns;
|
||||
- } else {
|
||||
- /*
|
||||
- * Use the performance multiplier and the user-configurable
|
||||
- * latency_req to determine the maximum exit latency.
|
||||
- */
|
||||
- interactivity_req = div64_u64(predicted_ns,
|
||||
- performance_multiplier(nr_iowaiters));
|
||||
- if (latency_req > interactivity_req)
|
||||
- latency_req = interactivity_req;
|
||||
+ } else if (latency_req > predicted_ns) {
|
||||
+ latency_req = predicted_ns;
|
||||
}
|
||||
|
||||
/*
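Not part of the patch: a small, self-contained C sketch (an illustration only, using stdint types instead of the kernel's) of the exit-latency limit before and after the change above. The 1 + 10 * nr_iowaiters factor is the performance multiplier this patch removes:

#include <stdint.h>

/* Old behaviour: tasks waiting for IO raised the tolerated exit latency. */
static uint64_t old_latency_limit(uint64_t latency_req, uint64_t predicted_ns,
                                  unsigned int nr_iowaiters)
{
        uint64_t interactivity_req = predicted_ns / (1 + 10 * nr_iowaiters);

        return latency_req < interactivity_req ? latency_req : interactivity_req;
}

/* New behaviour: never tolerate more exit latency than the predicted idle
 * duration, independent of how many tasks are in iowait on this CPU. */
static uint64_t new_latency_limit(uint64_t latency_req, uint64_t predicted_ns)
{
        return latency_req < predicted_ns ? latency_req : predicted_ns;
}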
|
debian/patches/patchset-pf/cpuidle/0002-cpuidle-Prefer-teo-over-menu-governor.patch (new, vendored, 58 lines)
@@ -0,0 +1,58 @@
From 3f840a42780323a4437dd1a417488d141c33af15 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:39 +0100
Subject: cpuidle: Prefer teo over menu governor

Since menu no longer has the interactivity boost, teo works better
overall, so make it the default.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 drivers/cpuidle/Kconfig          | 5 +----
 drivers/cpuidle/governors/menu.c | 2 +-
 drivers/cpuidle/governors/teo.c  | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -5,7 +5,7 @@ config CPU_IDLE
|
||||
bool "CPU idle PM support"
|
||||
default y if ACPI || PPC_PSERIES
|
||||
select CPU_IDLE_GOV_LADDER if (!NO_HZ && !NO_HZ_IDLE)
|
||||
- select CPU_IDLE_GOV_MENU if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_TEO
|
||||
+ select CPU_IDLE_GOV_TEO if (NO_HZ || NO_HZ_IDLE) && !CPU_IDLE_GOV_MENU
|
||||
help
|
||||
CPU idle is a generic framework for supporting software-controlled
|
||||
idle processor power management. It includes modular cross-platform
|
||||
@@ -30,9 +30,6 @@ config CPU_IDLE_GOV_TEO
|
||||
This governor implements a simplified idle state selection method
|
||||
focused on timer events and does not do any interactivity boosting.
|
||||
|
||||
- Some workloads benefit from using it and it generally should be safe
|
||||
- to use. Say Y here if you are not happy with the alternatives.
|
||||
-
|
||||
config CPU_IDLE_GOV_HALTPOLL
|
||||
bool "Haltpoll governor (for virtualized systems)"
|
||||
depends on KVM_GUEST
|
||||
--- a/drivers/cpuidle/governors/menu.c
|
||||
+++ b/drivers/cpuidle/governors/menu.c
|
||||
@@ -508,7 +508,7 @@ static int menu_enable_device(struct cpu
|
||||
|
||||
static struct cpuidle_governor menu_governor = {
|
||||
.name = "menu",
|
||||
- .rating = 20,
|
||||
+ .rating = 19,
|
||||
.enable = menu_enable_device,
|
||||
.select = menu_select,
|
||||
.reflect = menu_reflect,
|
||||
--- a/drivers/cpuidle/governors/teo.c
|
||||
+++ b/drivers/cpuidle/governors/teo.c
|
||||
@@ -537,7 +537,7 @@ static int teo_enable_device(struct cpui
|
||||
|
||||
static struct cpuidle_governor teo_governor = {
|
||||
.name = "teo",
|
||||
- .rating = 19,
|
||||
+ .rating = 20,
|
||||
.enable = teo_enable_device,
|
||||
.select = teo_select,
|
||||
.reflect = teo_reflect,
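Not part of the patch: the cpuidle core prefers the registered governor with the highest rating when no governor is forced, so swapping the two ratings flips the default. A hypothetical, simplified C sketch of that selection rule (names and structure are illustrative assumptions, not the kernel's):

struct governor_sketch {
        const char *name;
        int rating;
};

/* Highest rating wins; with this patch teo (20) now outranks menu (19). */
static const struct governor_sketch *
pick_default(const struct governor_sketch *govs, int count)
{
        const struct governor_sketch *best = &govs[0];

        for (int i = 1; i < count; i++)
                if (govs[i].rating > best->rating)
                        best = &govs[i];
        return best;
}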
|
debian/patches/patchset-pf/cpuidle/0003-TEST-cpufreq-schedutil-Linear-iowait-boost-step.patch (new, vendored, 39 lines)
@@ -0,0 +1,39 @@
From ca8c9368b6f28ef625716b03aa930acfb8afe158 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:40 +0100
Subject: TEST: cpufreq/schedutil: Linear iowait boost step

In preparation for capping iowait boost, make the steps linear rather
than doubling.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 kernel/sched/cpufreq_schedutil.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -267,7 +267,8 @@ static void sugov_iowait_boost(struct su
|
||||
/* Double the boost at each request */
|
||||
if (sg_cpu->iowait_boost) {
|
||||
sg_cpu->iowait_boost =
|
||||
- min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
|
||||
+ min_t(unsigned int,
|
||||
+ sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -308,11 +309,9 @@ static unsigned long sugov_iowait_apply(
|
||||
/*
|
||||
* No boost pending; reduce the boost value.
|
||||
*/
|
||||
- sg_cpu->iowait_boost >>= 1;
|
||||
- if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
|
||||
- sg_cpu->iowait_boost = 0;
|
||||
+ sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN;
|
||||
+ if (!sg_cpu->iowait_boost)
|
||||
return 0;
|
||||
- }
|
||||
}
|
||||
|
||||
sg_cpu->iowait_boost_pending = false;
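Not part of the patch: a standalone C sketch (illustration only, assuming the usual SCHED_CAPACITY_SCALE of 1024) of the ramp change above — the boost now grows by a fixed IOWAIT_BOOST_MIN step per boosted wakeup instead of doubling, and decays by the same step instead of halving:

#define SCHED_CAPACITY_SCALE    1024u
#define IOWAIT_BOOST_MIN        (SCHED_CAPACITY_SCALE / 8)

/* Old ramp: geometric, doubling on every boosted wakeup. */
static unsigned int ramp_old(unsigned int boost)
{
        unsigned int next = boost << 1;

        return next < SCHED_CAPACITY_SCALE ? next : SCHED_CAPACITY_SCALE;
}

/* New ramp: linear, one fixed step per boosted wakeup. */
static unsigned int ramp_new(unsigned int boost)
{
        unsigned int next = boost + IOWAIT_BOOST_MIN;

        return next < SCHED_CAPACITY_SCALE ? next : SCHED_CAPACITY_SCALE;
}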
|
debian/patches/patchset-pf/cpuidle/0004-TEST-cpufreq-schedutil-iowait-boost-cap-sysfs.patch (new, vendored, 106 lines)
@@ -0,0 +1,106 @@
From 33f05bd16a4ac2f6f36c9eb88016e2375dcb597c Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:41 +0100
Subject: TEST: cpufreq/schedutil: iowait boost cap sysfs

Add a knob to cap the applied iowait_boost via sysfs.
This is to test for potential regressions.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 kernel/sched/cpufreq_schedutil.c | 38 ++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,6 +11,7 @@
|
||||
struct sugov_tunables {
|
||||
struct gov_attr_set attr_set;
|
||||
unsigned int rate_limit_us;
|
||||
+ unsigned int iowait_boost_cap;
|
||||
};
|
||||
|
||||
struct sugov_policy {
|
||||
@@ -35,6 +36,8 @@ struct sugov_policy {
|
||||
|
||||
bool limits_changed;
|
||||
bool need_freq_update;
|
||||
+
|
||||
+ unsigned int iowait_boost_cap;
|
||||
};
|
||||
|
||||
struct sugov_cpu {
|
||||
@@ -316,6 +319,9 @@ static unsigned long sugov_iowait_apply(
|
||||
|
||||
sg_cpu->iowait_boost_pending = false;
|
||||
|
||||
+ if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap)
|
||||
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap;
|
||||
+
|
||||
/*
|
||||
* sg_cpu->util is already in capacity scale; convert iowait_boost
|
||||
* into the same scale so we can compare.
|
||||
@@ -554,6 +560,14 @@ static ssize_t rate_limit_us_show(struct
|
||||
return sprintf(buf, "%u\n", tunables->rate_limit_us);
|
||||
}
|
||||
|
||||
+
|
||||
+static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf)
|
||||
+{
|
||||
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
|
||||
+
|
||||
+ return sprintf(buf, "%u\n", tunables->iowait_boost_cap);
|
||||
+}
|
||||
+
|
||||
static ssize_t
|
||||
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
|
||||
{
|
||||
@@ -572,10 +586,30 @@ rate_limit_us_store(struct gov_attr_set
|
||||
return count;
|
||||
}
|
||||
|
||||
+static ssize_t
|
||||
+iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
|
||||
+{
|
||||
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
|
||||
+ struct sugov_policy *sg_policy;
|
||||
+ unsigned int iowait_boost_cap;
|
||||
+
|
||||
+ if (kstrtouint(buf, 10, &iowait_boost_cap))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ tunables->iowait_boost_cap = iowait_boost_cap;
|
||||
+
|
||||
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
|
||||
+ sg_policy->iowait_boost_cap = iowait_boost_cap;
|
||||
+
|
||||
+ return count;
|
||||
+}
|
||||
+
|
||||
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
|
||||
+static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap);
|
||||
|
||||
static struct attribute *sugov_attrs[] = {
|
||||
&rate_limit_us.attr,
|
||||
+ &iowait_boost_cap.attr,
|
||||
NULL
|
||||
};
|
||||
ATTRIBUTE_GROUPS(sugov);
|
||||
@@ -765,6 +799,8 @@ static int sugov_init(struct cpufreq_pol
|
||||
|
||||
tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
|
||||
|
||||
+ tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE;
|
||||
+
|
||||
policy->governor_data = sg_policy;
|
||||
sg_policy->tunables = tunables;
|
||||
|
||||
@@ -834,6 +870,8 @@ static int sugov_start(struct cpufreq_po
|
||||
sg_policy->limits_changed = false;
|
||||
sg_policy->cached_raw_freq = 0;
|
||||
|
||||
+ sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE;
|
||||
+
|
||||
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
|
||||
|
||||
if (policy_is_shared(policy))
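Not part of the patch: the new cap is applied as a simple upper clamp before the boost is converted to capacity scale, and it is initialized to SCHED_CAPACITY_SCALE (i.e. no effective limit) until a value is written to the new iowait_boost_cap attribute. A trivial C sketch of the clamp:

static unsigned int apply_iowait_boost_cap(unsigned int iowait_boost,
                                           unsigned int iowait_boost_cap)
{
        /* Defaults to SCHED_CAPACITY_SCALE, so the clamp is a no-op
         * unless the sysfs knob has been lowered. */
        return iowait_boost > iowait_boost_cap ? iowait_boost_cap : iowait_boost;
}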
|
debian/patches/patchset-pf/cpuidle/0005-cpufreq-schedutil-Remove-iowait-boost.patch (new, vendored, 325 lines)
@@ -0,0 +1,325 @@
From 33eb6c08d7c615fad308001921c7b1148cbccfde Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:42 +0100
Subject: cpufreq/schedutil: Remove iowait boost

iowait boost in schedutil was introduced by commit 21ca6d2c52f8
("cpufreq: schedutil: Add iowait boosting"), with it more or less
following intel_pstate's approach to increase frequency after an
iowait wakeup. Behaviour that is piggy-backed onto iowait boost is
problematic for a number of reasons, so remove it.

For schedutil specifically these are some of the reasons:
1. Boosting is applied even in scenarios where it doesn't improve
   throughput.
2. The boost is not accounted for in EAS: a) feec() will only consider
   the actual task utilization for task placement, but another CPU
   might be more energy-efficient at that capacity than the boosted
   one. b) When placing a non-IO task while a CPU is boosted,
   compute_energy() assumes a lower OPP than what is actually applied.
   This leads to wrong EAS decisions.
3. Actual IO-heavy workloads are hardly distinguished from infrequent
   in_iowait wakeups.
4. The boost isn't accounted for in task placement.
5. The boost isn't associated with a task; it therefore lingers on the
   rq even after the responsible task has migrated / stopped.
6. The boost isn't associated with a task; it therefore needs to ramp
   up again when migrated.
7. Since schedutil doesn't know which task is getting woken up,
   multiple unrelated in_iowait tasks lead to boosting.
8. Boosting is hard to control with UCLAMP_MAX (which is only active
   when the task is on the rq, which for boosted tasks is usually not
   the case most of the time).

One benefit of schedutil specifically is the reliance on the
scheduler's utilization signals, which have evolved a lot since its
original introduction. Some cases that benefitted from iowait boosting
in the past can now be covered by e.g. util_est.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 kernel/sched/cpufreq_schedutil.c | 181 +------------------------------
 1 file changed, 3 insertions(+), 178 deletions(-)

--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,12 +6,9 @@
|
||||
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
*/
|
||||
|
||||
-#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
|
||||
-
|
||||
struct sugov_tunables {
|
||||
struct gov_attr_set attr_set;
|
||||
unsigned int rate_limit_us;
|
||||
- unsigned int iowait_boost_cap;
|
||||
};
|
||||
|
||||
struct sugov_policy {
|
||||
@@ -36,8 +33,6 @@ struct sugov_policy {
|
||||
|
||||
bool limits_changed;
|
||||
bool need_freq_update;
|
||||
-
|
||||
- unsigned int iowait_boost_cap;
|
||||
};
|
||||
|
||||
struct sugov_cpu {
|
||||
@@ -45,10 +40,6 @@ struct sugov_cpu {
|
||||
struct sugov_policy *sg_policy;
|
||||
unsigned int cpu;
|
||||
|
||||
- bool iowait_boost_pending;
|
||||
- unsigned int iowait_boost;
|
||||
- u64 last_update;
|
||||
-
|
||||
unsigned long util;
|
||||
unsigned long bw_min;
|
||||
|
||||
@@ -198,137 +189,15 @@ unsigned long sugov_effective_cpu_perf(i
|
||||
return max(min, max);
|
||||
}
|
||||
|
||||
-static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
|
||||
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
|
||||
{
|
||||
unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
|
||||
|
||||
util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
|
||||
- util = max(util, boost);
|
||||
sg_cpu->bw_min = min;
|
||||
sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
|
||||
}
|
||||
|
||||
-/**
|
||||
- * sugov_iowait_reset() - Reset the IO boost status of a CPU.
|
||||
- * @sg_cpu: the sugov data for the CPU to boost
|
||||
- * @time: the update time from the caller
|
||||
- * @set_iowait_boost: true if an IO boost has been requested
|
||||
- *
|
||||
- * The IO wait boost of a task is disabled after a tick since the last update
|
||||
- * of a CPU. If a new IO wait boost is requested after more then a tick, then
|
||||
- * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
|
||||
- * efficiency by ignoring sporadic wakeups from IO.
|
||||
- */
|
||||
-static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
|
||||
- bool set_iowait_boost)
|
||||
-{
|
||||
- s64 delta_ns = time - sg_cpu->last_update;
|
||||
-
|
||||
- /* Reset boost only if a tick has elapsed since last request */
|
||||
- if (delta_ns <= TICK_NSEC)
|
||||
- return false;
|
||||
-
|
||||
- sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
|
||||
- sg_cpu->iowait_boost_pending = set_iowait_boost;
|
||||
-
|
||||
- return true;
|
||||
-}
|
||||
-
|
||||
-/**
|
||||
- * sugov_iowait_boost() - Updates the IO boost status of a CPU.
|
||||
- * @sg_cpu: the sugov data for the CPU to boost
|
||||
- * @time: the update time from the caller
|
||||
- * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
|
||||
- *
|
||||
- * Each time a task wakes up after an IO operation, the CPU utilization can be
|
||||
- * boosted to a certain utilization which doubles at each "frequent and
|
||||
- * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
|
||||
- * of the maximum OPP.
|
||||
- *
|
||||
- * To keep doubling, an IO boost has to be requested at least once per tick,
|
||||
- * otherwise we restart from the utilization of the minimum OPP.
|
||||
- */
|
||||
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
|
||||
- unsigned int flags)
|
||||
-{
|
||||
- bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
|
||||
-
|
||||
- /* Reset boost if the CPU appears to have been idle enough */
|
||||
- if (sg_cpu->iowait_boost &&
|
||||
- sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
|
||||
- return;
|
||||
-
|
||||
- /* Boost only tasks waking up after IO */
|
||||
- if (!set_iowait_boost)
|
||||
- return;
|
||||
-
|
||||
- /* Ensure boost doubles only one time at each request */
|
||||
- if (sg_cpu->iowait_boost_pending)
|
||||
- return;
|
||||
- sg_cpu->iowait_boost_pending = true;
|
||||
-
|
||||
- /* Double the boost at each request */
|
||||
- if (sg_cpu->iowait_boost) {
|
||||
- sg_cpu->iowait_boost =
|
||||
- min_t(unsigned int,
|
||||
- sg_cpu->iowait_boost + IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE);
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
- /* First wakeup after IO: start with minimum boost */
|
||||
- sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
|
||||
-}
|
||||
-
|
||||
-/**
|
||||
- * sugov_iowait_apply() - Apply the IO boost to a CPU.
|
||||
- * @sg_cpu: the sugov data for the cpu to boost
|
||||
- * @time: the update time from the caller
|
||||
- * @max_cap: the max CPU capacity
|
||||
- *
|
||||
- * A CPU running a task which woken up after an IO operation can have its
|
||||
- * utilization boosted to speed up the completion of those IO operations.
|
||||
- * The IO boost value is increased each time a task wakes up from IO, in
|
||||
- * sugov_iowait_apply(), and it's instead decreased by this function,
|
||||
- * each time an increase has not been requested (!iowait_boost_pending).
|
||||
- *
|
||||
- * A CPU which also appears to have been idle for at least one tick has also
|
||||
- * its IO boost utilization reset.
|
||||
- *
|
||||
- * This mechanism is designed to boost high frequently IO waiting tasks, while
|
||||
- * being more conservative on tasks which does sporadic IO operations.
|
||||
- */
|
||||
-static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
|
||||
- unsigned long max_cap)
|
||||
-{
|
||||
- /* No boost currently required */
|
||||
- if (!sg_cpu->iowait_boost)
|
||||
- return 0;
|
||||
-
|
||||
- /* Reset boost if the CPU appears to have been idle enough */
|
||||
- if (sugov_iowait_reset(sg_cpu, time, false))
|
||||
- return 0;
|
||||
-
|
||||
- if (!sg_cpu->iowait_boost_pending) {
|
||||
- /*
|
||||
- * No boost pending; reduce the boost value.
|
||||
- */
|
||||
- sg_cpu->iowait_boost -= IOWAIT_BOOST_MIN;
|
||||
- if (!sg_cpu->iowait_boost)
|
||||
- return 0;
|
||||
- }
|
||||
-
|
||||
- sg_cpu->iowait_boost_pending = false;
|
||||
-
|
||||
- if (sg_cpu->iowait_boost > sg_cpu->sg_policy->iowait_boost_cap)
|
||||
- sg_cpu->iowait_boost = sg_cpu->sg_policy->iowait_boost_cap;
|
||||
-
|
||||
- /*
|
||||
- * sg_cpu->util is already in capacity scale; convert iowait_boost
|
||||
- * into the same scale so we can compare.
|
||||
- */
|
||||
- return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
|
||||
-}
|
||||
-
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
|
||||
{
|
||||
@@ -356,18 +225,12 @@ static inline bool sugov_update_single_c
|
||||
u64 time, unsigned long max_cap,
|
||||
unsigned int flags)
|
||||
{
|
||||
- unsigned long boost;
|
||||
-
|
||||
- sugov_iowait_boost(sg_cpu, time, flags);
|
||||
- sg_cpu->last_update = time;
|
||||
-
|
||||
ignore_dl_rate_limit(sg_cpu);
|
||||
|
||||
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
|
||||
return false;
|
||||
|
||||
- boost = sugov_iowait_apply(sg_cpu, time, max_cap);
|
||||
- sugov_get_util(sg_cpu, boost);
|
||||
+ sugov_get_util(sg_cpu);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -468,11 +331,8 @@ static unsigned int sugov_next_freq_shar
|
||||
|
||||
for_each_cpu(j, policy->cpus) {
|
||||
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
|
||||
- unsigned long boost;
|
||||
-
|
||||
- boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
|
||||
- sugov_get_util(j_sg_cpu, boost);
|
||||
|
||||
+ sugov_get_util(j_sg_cpu);
|
||||
util = max(j_sg_cpu->util, util);
|
||||
}
|
||||
|
||||
@@ -488,9 +348,6 @@ sugov_update_shared(struct update_util_d
|
||||
|
||||
raw_spin_lock(&sg_policy->update_lock);
|
||||
|
||||
- sugov_iowait_boost(sg_cpu, time, flags);
|
||||
- sg_cpu->last_update = time;
|
||||
-
|
||||
ignore_dl_rate_limit(sg_cpu);
|
||||
|
||||
if (sugov_should_update_freq(sg_policy, time)) {
|
||||
@@ -560,14 +417,6 @@ static ssize_t rate_limit_us_show(struct
|
||||
return sprintf(buf, "%u\n", tunables->rate_limit_us);
|
||||
}
|
||||
|
||||
-
|
||||
-static ssize_t iowait_boost_cap_show(struct gov_attr_set *attr_set, char *buf)
|
||||
-{
|
||||
- struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
|
||||
-
|
||||
- return sprintf(buf, "%u\n", tunables->iowait_boost_cap);
|
||||
-}
|
||||
-
|
||||
static ssize_t
|
||||
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
|
||||
{
|
||||
@@ -586,30 +435,10 @@ rate_limit_us_store(struct gov_attr_set
|
||||
return count;
|
||||
}
|
||||
|
||||
-static ssize_t
|
||||
-iowait_boost_cap_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
|
||||
-{
|
||||
- struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
|
||||
- struct sugov_policy *sg_policy;
|
||||
- unsigned int iowait_boost_cap;
|
||||
-
|
||||
- if (kstrtouint(buf, 10, &iowait_boost_cap))
|
||||
- return -EINVAL;
|
||||
-
|
||||
- tunables->iowait_boost_cap = iowait_boost_cap;
|
||||
-
|
||||
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
|
||||
- sg_policy->iowait_boost_cap = iowait_boost_cap;
|
||||
-
|
||||
- return count;
|
||||
-}
|
||||
-
|
||||
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
|
||||
-static struct governor_attr iowait_boost_cap = __ATTR_RW(iowait_boost_cap);
|
||||
|
||||
static struct attribute *sugov_attrs[] = {
|
||||
&rate_limit_us.attr,
|
||||
- &iowait_boost_cap.attr,
|
||||
NULL
|
||||
};
|
||||
ATTRIBUTE_GROUPS(sugov);
|
||||
@@ -799,8 +628,6 @@ static int sugov_init(struct cpufreq_pol
|
||||
|
||||
tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
|
||||
|
||||
- tunables->iowait_boost_cap = SCHED_CAPACITY_SCALE;
|
||||
-
|
||||
policy->governor_data = sg_policy;
|
||||
sg_policy->tunables = tunables;
|
||||
|
||||
@@ -870,8 +697,6 @@ static int sugov_start(struct cpufreq_po
|
||||
sg_policy->limits_changed = false;
|
||||
sg_policy->cached_raw_freq = 0;
|
||||
|
||||
- sg_policy->iowait_boost_cap = SCHED_CAPACITY_SCALE;
|
||||
-
|
||||
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
|
||||
|
||||
if (policy_is_shared(policy))
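Not part of the patch: with the boost gone, sugov_get_util() no longer raises the CPU's utilization to an iowait-boost floor. A minimal, hypothetical C sketch (not kernel code) of the value used to pick the requested performance level before and after:

/* Before: the requested level was at least the decayed iowait boost. */
static unsigned long effective_util_old(unsigned long util, unsigned long boost)
{
        return util > boost ? util : boost;     /* util = max(util, boost) */
}

/* After: only the scheduler's utilization signal is used. */
static unsigned long effective_util_new(unsigned long util)
{
        return util;
}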
|
debian/patches/patchset-pf/cpuidle/0006-cpufreq-intel_pstate-Remove-iowait-boost.patch (new, vendored, 113 lines)
@@ -0,0 +1,113 @@
From af7bbb59c2411e985a5d79173af5686337b4af9b Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:43 +0100
Subject: cpufreq: intel_pstate: Remove iowait boost

Analogous to schedutil, remove iowait boost for the same reasons.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 drivers/cpufreq/intel_pstate.c | 50 ++--------------------------------
 1 file changed, 3 insertions(+), 47 deletions(-)

--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -191,7 +191,6 @@ struct global_params {
|
||||
* @policy: CPUFreq policy value
|
||||
* @update_util: CPUFreq utility callback information
|
||||
* @update_util_set: CPUFreq utility callback is set
|
||||
- * @iowait_boost: iowait-related boost fraction
|
||||
* @last_update: Time of the last update.
|
||||
* @pstate: Stores P state limits for this CPU
|
||||
* @vid: Stores VID limits for this CPU
|
||||
@@ -245,7 +244,6 @@ struct cpudata {
|
||||
struct acpi_processor_performance acpi_perf_data;
|
||||
bool valid_pss_table;
|
||||
#endif
|
||||
- unsigned int iowait_boost;
|
||||
s16 epp_powersave;
|
||||
s16 epp_policy;
|
||||
s16 epp_default;
|
||||
@@ -2136,28 +2134,7 @@ static inline void intel_pstate_update_u
|
||||
{
|
||||
cpu->sample.time = time;
|
||||
|
||||
- if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) {
|
||||
- bool do_io = false;
|
||||
-
|
||||
- cpu->sched_flags = 0;
|
||||
- /*
|
||||
- * Set iowait_boost flag and update time. Since IO WAIT flag
|
||||
- * is set all the time, we can't just conclude that there is
|
||||
- * some IO bound activity is scheduled on this CPU with just
|
||||
- * one occurrence. If we receive at least two in two
|
||||
- * consecutive ticks, then we treat as boost candidate.
|
||||
- */
|
||||
- if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC))
|
||||
- do_io = true;
|
||||
-
|
||||
- cpu->last_io_update = time;
|
||||
-
|
||||
- if (do_io)
|
||||
- intel_pstate_hwp_boost_up(cpu);
|
||||
-
|
||||
- } else {
|
||||
- intel_pstate_hwp_boost_down(cpu);
|
||||
- }
|
||||
+ intel_pstate_hwp_boost_down(cpu);
|
||||
}
|
||||
|
||||
static inline void intel_pstate_update_util_hwp(struct update_util_data *data,
|
||||
@@ -2240,9 +2217,6 @@ static inline int32_t get_target_pstate(
|
||||
busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift,
|
||||
sample->tsc);
|
||||
|
||||
- if (busy_frac < cpu->iowait_boost)
|
||||
- busy_frac = cpu->iowait_boost;
|
||||
-
|
||||
sample->busy_scaled = busy_frac * 100;
|
||||
|
||||
target = READ_ONCE(global.no_turbo) ?
|
||||
@@ -2303,7 +2277,7 @@ static void intel_pstate_adjust_pstate(s
|
||||
sample->aperf,
|
||||
sample->tsc,
|
||||
get_avg_frequency(cpu),
|
||||
- fp_toint(cpu->iowait_boost * 100));
|
||||
+ 0);
|
||||
}
|
||||
|
||||
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
|
||||
@@ -2317,24 +2291,6 @@ static void intel_pstate_update_util(str
|
||||
return;
|
||||
|
||||
delta_ns = time - cpu->last_update;
|
||||
- if (flags & SCHED_CPUFREQ_IOWAIT) {
|
||||
- /* Start over if the CPU may have been idle. */
|
||||
- if (delta_ns > TICK_NSEC) {
|
||||
- cpu->iowait_boost = ONE_EIGHTH_FP;
|
||||
- } else if (cpu->iowait_boost >= ONE_EIGHTH_FP) {
|
||||
- cpu->iowait_boost <<= 1;
|
||||
- if (cpu->iowait_boost > int_tofp(1))
|
||||
- cpu->iowait_boost = int_tofp(1);
|
||||
- } else {
|
||||
- cpu->iowait_boost = ONE_EIGHTH_FP;
|
||||
- }
|
||||
- } else if (cpu->iowait_boost) {
|
||||
- /* Clear iowait_boost if the CPU may have been idle. */
|
||||
- if (delta_ns > TICK_NSEC)
|
||||
- cpu->iowait_boost = 0;
|
||||
- else
|
||||
- cpu->iowait_boost >>= 1;
|
||||
- }
|
||||
cpu->last_update = time;
|
||||
delta_ns = time - cpu->sample.time;
|
||||
if ((s64)delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL)
|
||||
@@ -2832,7 +2788,7 @@ static void intel_cpufreq_trace(struct c
|
||||
sample->aperf,
|
||||
sample->tsc,
|
||||
get_avg_frequency(cpu),
|
||||
- fp_toint(cpu->iowait_boost * 100));
|
||||
+ 0);
|
||||
}
|
||||
|
||||
static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max,
|
debian/patches/patchset-pf/cpuidle/0007-cpufreq-Remove-SCHED_CPUFREQ_IOWAIT-update.patch (new, vendored, 42 lines)
@@ -0,0 +1,42 @@
From fd1e0723b0a7ad140d2bf7cd9154997d5ece2b37 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:44 +0100
Subject: cpufreq: Remove SCHED_CPUFREQ_IOWAIT update

Neither intel_pstate nor schedutil cares about the flag anymore, so
remove the update and the flag definition.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 include/linux/sched/cpufreq.h | 2 --
 kernel/sched/fair.c           | 8 --------
 2 files changed, 10 deletions(-)

--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,8 +8,6 @@
|
||||
* Interface between cpufreq drivers and the scheduler:
|
||||
*/
|
||||
|
||||
-#define SCHED_CPUFREQ_IOWAIT (1U << 0)
|
||||
-
|
||||
#ifdef CONFIG_CPU_FREQ
|
||||
struct cpufreq_policy;
|
||||
|
||||
--- a/kernel/sched/fair.c
|
||||
+++ b/kernel/sched/fair.c
|
||||
@@ -6768,14 +6768,6 @@ enqueue_task_fair(struct rq *rq, struct
|
||||
*/
|
||||
util_est_enqueue(&rq->cfs, p);
|
||||
|
||||
- /*
|
||||
- * If in_iowait is set, the code below may not trigger any cpufreq
|
||||
- * utilization updates, so do it here explicitly with the IOWAIT flag
|
||||
- * passed.
|
||||
- */
|
||||
- if (p->in_iowait)
|
||||
- cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
|
||||
-
|
||||
for_each_sched_entity(se) {
|
||||
if (se->on_rq)
|
||||
break;
|
debian/patches/patchset-pf/cpuidle/0008-io_uring-Do-not-set-iowait-before-sleeping.patch (new, vendored, 55 lines)
@@ -0,0 +1,55 @@
From 30cdb8d7d06f51bb86142c537ea05bd01c31bb40 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Thu, 5 Sep 2024 10:26:45 +0100
Subject: io_uring: Do not set iowait before sleeping

Setting in_iowait was introduced in commit 8a796565cec3 ("io_uring: Use
io_schedule* in cqring wait") to tackle a perf regression that was
caused by menu taking iowait into account for synchronous IO and thus
not selecting deeper states like in the io_uring counterpart.
That behaviour is gone, so the workaround can be removed.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 io_uring/io_uring.c | 17 -----------------
 1 file changed, 17 deletions(-)

--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2359,15 +2359,6 @@ int io_run_task_work_sig(struct io_ring_
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static bool current_pending_io(void)
|
||||
-{
|
||||
- struct io_uring_task *tctx = current->io_uring;
|
||||
-
|
||||
- if (!tctx)
|
||||
- return false;
|
||||
- return percpu_counter_read_positive(&tctx->inflight);
|
||||
-}
|
||||
-
|
||||
/* when returns >0, the caller should retry */
|
||||
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
|
||||
struct io_wait_queue *iowq)
|
||||
@@ -2385,19 +2376,11 @@ static inline int io_cqring_wait_schedul
|
||||
if (unlikely(io_should_wake(iowq)))
|
||||
return 0;
|
||||
|
||||
- /*
|
||||
- * Mark us as being in io_wait if we have pending requests, so cpufreq
|
||||
- * can take into account that the task is waiting for IO - turns out
|
||||
- * to be important for low QD IO.
|
||||
- */
|
||||
- if (current_pending_io())
|
||||
- current->in_iowait = 1;
|
||||
ret = 0;
|
||||
if (iowq->timeout == KTIME_MAX)
|
||||
schedule();
|
||||
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
|
||||
ret = -ETIME;
|
||||
- current->in_iowait = 0;
|
||||
return ret;
|
||||
}
|
||||
|
debian/patches/patchset-pf/crypto/0001-crypto-x86-crc32c-simplify-code-for-handling-fewer-t.patch (new, vendored, 181 lines)
@@ -0,0 +1,181 @@
From e6b67a8d14e86d63062e6f1f234c5afc235561d4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - simplify code for handling fewer than 200 bytes

The assembly code in crc32c-pcl-intel-asm_64.S is invoked only for
lengths >= 512, due to the overhead of saving and restoring FPU state.
Therefore, it is unnecessary for this code to be excessively "optimized"
for lengths < 200. Eliminate the excessive unrolling of this part of
the code and use a more straightforward qword-at-a-time loop.

Note: the part of the code in question is not entirely redundant, as it
is still used to process any remainder mod 24, as well as any remaining
data when fewer than 200 bytes remain after at least one 3072-byte chunk.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 116 ++++++----------------
 1 file changed, 33 insertions(+), 83 deletions(-)

--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -56,20 +56,10 @@
|
||||
.quad .Lcrc_\i
|
||||
.endm
|
||||
|
||||
-.macro JNC_LESS_THAN j
|
||||
- jnc .Lless_than_\j
|
||||
-.endm
|
||||
-
|
||||
-# Define threshold where buffers are considered "small" and routed to more
|
||||
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
|
||||
-# SMALL_SIZE can be no larger than 255.
|
||||
-
|
||||
+# Define threshold below which buffers are considered "small" and routed to
|
||||
+# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
|
||||
-.if (SMALL_SIZE > 255)
|
||||
-.error "SMALL_ SIZE must be < 256"
|
||||
-.endif
|
||||
-
|
||||
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
||||
|
||||
.text
|
||||
@@ -100,25 +90,18 @@ SYM_FUNC_START(crc_pcl)
|
||||
## Move crc_init for Linux to a different
|
||||
mov crc_init_arg, crc_init
|
||||
|
||||
+ mov %bufp, bufptmp # rdi = *buf
|
||||
+ cmp $SMALL_SIZE, len
|
||||
+ jb .Lsmall
|
||||
+
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
-
|
||||
- mov %bufp, bufptmp # rdi = *buf
|
||||
neg %bufp
|
||||
and $7, %bufp # calculate the unalignment amount of
|
||||
# the address
|
||||
je .Lproc_block # Skip if aligned
|
||||
|
||||
- ## If len is less than 8 and we're unaligned, we need to jump
|
||||
- ## to special code to avoid reading beyond the end of the buffer
|
||||
- cmp $8, len
|
||||
- jae .Ldo_align
|
||||
- # less_than_8 expects length in upper 3 bits of len_dw
|
||||
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
- shl $32-3+1, len_dw
|
||||
- jmp .Lless_than_8_post_shl1
|
||||
-
|
||||
.Ldo_align:
|
||||
#### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
movq (bufptmp), tmp # load a quadward from the buffer
|
||||
@@ -144,9 +127,6 @@ SYM_FUNC_START(crc_pcl)
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
- cmpq $SMALL_SIZE, len
|
||||
- jb .Lsmall
|
||||
-
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
mul len_dw
|
||||
@@ -243,68 +223,38 @@ LABEL crc_ 0
|
||||
mov tmp, len
|
||||
cmp $128*24, tmp
|
||||
jae .Lfull_block
|
||||
- cmp $24, tmp
|
||||
+ cmp $SMALL_SIZE, tmp
|
||||
jae .Lcontinue_block
|
||||
|
||||
-.Lless_than_24:
|
||||
- shl $32-4, len_dw # less_than_16 expects length
|
||||
- # in upper 4 bits of len_dw
|
||||
- jnc .Lless_than_16
|
||||
- crc32q (bufptmp), crc_init
|
||||
- crc32q 8(bufptmp), crc_init
|
||||
- jz .Ldo_return
|
||||
- add $16, bufptmp
|
||||
- # len is less than 8 if we got here
|
||||
- # less_than_8 expects length in upper 3 bits of len_dw
|
||||
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
- shl $2, len_dw
|
||||
- jmp .Lless_than_8_post_shl1
|
||||
-
|
||||
#######################################################################
|
||||
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
|
||||
+ ## 6) Process any remainder without interleaving:
|
||||
#######################################################################
|
||||
.Lsmall:
|
||||
- shl $32-8, len_dw # Prepare len_dw for less_than_256
|
||||
- j=256
|
||||
-.rept 5 # j = {256, 128, 64, 32, 16}
|
||||
-.altmacro
|
||||
-LABEL less_than_ %j # less_than_j: Length should be in
|
||||
- # upper lg(j) bits of len_dw
|
||||
- j=(j/2)
|
||||
- shl $1, len_dw # Get next MSB
|
||||
- JNC_LESS_THAN %j
|
||||
-.noaltmacro
|
||||
- i=0
|
||||
-.rept (j/8)
|
||||
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
|
||||
- i=i+8
|
||||
-.endr
|
||||
- jz .Ldo_return # Return if remaining length is zero
|
||||
- add $j, bufptmp # Advance buf
|
||||
-.endr
|
||||
-
|
||||
-.Lless_than_8: # Length should be stored in
|
||||
- # upper 3 bits of len_dw
|
||||
- shl $1, len_dw
|
||||
-.Lless_than_8_post_shl1:
|
||||
- jnc .Lless_than_4
|
||||
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
|
||||
- jz .Ldo_return # return if remaining data is zero
|
||||
- add $4, bufptmp
|
||||
-.Lless_than_4: # Length should be stored in
|
||||
- # upper 2 bits of len_dw
|
||||
- shl $1, len_dw
|
||||
- jnc .Lless_than_2
|
||||
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
|
||||
- jz .Ldo_return # return if remaining data is zero
|
||||
- add $2, bufptmp
|
||||
-.Lless_than_2: # Length should be stored in the MSB
|
||||
- # of len_dw
|
||||
- shl $1, len_dw
|
||||
- jnc .Lless_than_1
|
||||
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
|
||||
-.Lless_than_1: # Length should be zero
|
||||
-.Ldo_return:
|
||||
+ test len, len
|
||||
+ jz .Ldone
|
||||
+ mov len_dw, %eax
|
||||
+ shr $3, %eax
|
||||
+ jz .Ldo_dword
|
||||
+.Ldo_qwords:
|
||||
+ crc32q (bufptmp), crc_init
|
||||
+ add $8, bufptmp
|
||||
+ dec %eax
|
||||
+ jnz .Ldo_qwords
|
||||
+.Ldo_dword:
|
||||
+ test $4, len_dw
|
||||
+ jz .Ldo_word
|
||||
+ crc32l (bufptmp), crc_init_dw
|
||||
+ add $4, bufptmp
|
||||
+.Ldo_word:
|
||||
+ test $2, len_dw
|
||||
+ jz .Ldo_byte
|
||||
+ crc32w (bufptmp), crc_init_dw
|
||||
+ add $2, bufptmp
|
||||
+.Ldo_byte:
|
||||
+ test $1, len_dw
|
||||
+ jz .Ldone
|
||||
+ crc32b (bufptmp), crc_init_dw
|
||||
+.Ldone:
|
||||
movq crc_init, %rax
|
||||
popq %rsi
|
||||
popq %rdi
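Not part of the patch: a user-space C equivalent (an illustrative assumption, using the SSE4.2 intrinsics rather than the kernel's assembly) of the straightforward remainder handling introduced above — whole qwords first, then at most one dword, word, and byte:

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>          /* SSE4.2 CRC32C intrinsics; build with -msse4.2 */

static uint32_t crc32c_small(uint32_t crc, const uint8_t *p, size_t len)
{
        uint64_t crc64 = crc;

        for (; len >= 8; len -= 8, p += 8) {
                uint64_t q;

                memcpy(&q, p, 8);               /* unaligned-safe load */
                crc64 = _mm_crc32_u64(crc64, q);
        }
        crc = (uint32_t)crc64;
        if (len & 4) {
                uint32_t d;

                memcpy(&d, p, 4);
                crc = _mm_crc32_u32(crc, d);
                p += 4;
        }
        if (len & 2) {
                uint16_t w;

                memcpy(&w, p, 2);
                crc = _mm_crc32_u16(crc, w);
                p += 2;
        }
        if (len & 1)
                crc = _mm_crc32_u8(crc, *p);
        return crc;
}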
|
debian/patches/patchset-pf/crypto/0002-crypto-x86-crc32c-access-32-bit-arguments-as-32-bit.patch (new, vendored, 187 lines)
@@ -0,0 +1,187 @@
From 430478d63b1403878f2fd4b12de2cd21ee502184 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - access 32-bit arguments as 32-bit

Fix crc32c-pcl-intel-asm_64.S to access 32-bit arguments as 32-bit
values instead of 64-bit, since the upper bits of the corresponding
64-bit registers are not guaranteed to be zero. Also update the type of
the length argument to be unsigned int rather than int, as the assembly
code treats it as unsigned.

Note: there haven't been any reports of this bug actually causing
incorrect behavior. Neither gcc nor clang guarantee zero-extension to
64 bits, but zero-extension is likely to happen in practice because most
instructions that operate on 32-bit registers zero-extend to 64 bits.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/crc32c-intel_glue.c       |  2 +-
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 57 +++++++++++------------
 2 files changed, 27 insertions(+), 32 deletions(-)

--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -41,7 +41,7 @@
|
||||
*/
|
||||
#define CRC32C_PCL_BREAKEVEN 512
|
||||
|
||||
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
|
||||
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
|
||||
unsigned int crc_init);
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
|
||||
@@ -60,7 +60,7 @@
|
||||
# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
|
||||
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
||||
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
|
||||
|
||||
.text
|
||||
SYM_FUNC_START(crc_pcl)
|
||||
@@ -72,14 +72,11 @@ SYM_FUNC_START(crc_pcl)
|
||||
#define block_0 %rcx
|
||||
#define block_1 %rdx
|
||||
#define block_2 %r11
|
||||
-#define len %rsi
|
||||
-#define len_dw %esi
|
||||
-#define len_w %si
|
||||
-#define len_b %sil
|
||||
-#define crc_init_arg %rdx
|
||||
+#define len %esi
|
||||
+#define crc_init_arg %edx
|
||||
#define tmp %rbx
|
||||
-#define crc_init %r8
|
||||
-#define crc_init_dw %r8d
|
||||
+#define crc_init %r8d
|
||||
+#define crc_init_q %r8
|
||||
#define crc1 %r9
|
||||
#define crc2 %r10
|
||||
|
||||
@@ -107,9 +104,9 @@ SYM_FUNC_START(crc_pcl)
|
||||
movq (bufptmp), tmp # load a quadward from the buffer
|
||||
add %bufp, bufptmp # align buffer pointer for quadword
|
||||
# processing
|
||||
- sub %bufp, len # update buffer length
|
||||
+ sub bufp_dw, len # update buffer length
|
||||
.Lalign_loop:
|
||||
- crc32b %bl, crc_init_dw # compute crc32 of 1-byte
|
||||
+ crc32b %bl, crc_init # compute crc32 of 1-byte
|
||||
shr $8, tmp # get next byte
|
||||
dec %bufp
|
||||
jne .Lalign_loop
|
||||
@@ -121,15 +118,14 @@ SYM_FUNC_START(crc_pcl)
|
||||
################################################################
|
||||
|
||||
## compute num of bytes to be processed
|
||||
- movq len, tmp # save num bytes in tmp
|
||||
|
||||
- cmpq $128*24, len
|
||||
+ cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
- mul len_dw
|
||||
+ mul len
|
||||
shrq $16, %rax
|
||||
|
||||
## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
@@ -176,7 +172,7 @@ SYM_FUNC_START(crc_pcl)
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
- crc32q -i*8(block_0), crc_init
|
||||
+ crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
crc32q -i*8(block_2), crc2
|
||||
i=(i-1)
|
||||
@@ -186,7 +182,7 @@ LABEL crc_ %i
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
- crc32q -i*8(block_0), crc_init
|
||||
+ crc32q -i*8(block_0), crc_init_q
|
||||
crc32q -i*8(block_1), crc1
|
||||
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
|
||||
@@ -200,9 +196,9 @@ LABEL crc_ %i
|
||||
shlq $3, %rax # rax *= 8
|
||||
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
- subq %rax, tmp # tmp -= rax*24
|
||||
+ sub %eax, len # len -= rax*24
|
||||
|
||||
- movq crc_init, %xmm1 # CRC for block 1
|
||||
+ movq crc_init_q, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
|
||||
movq crc1, %xmm2 # CRC for block 2
|
||||
@@ -211,8 +207,8 @@ LABEL crc_ %i
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
xor -i*8(block_2), %rax
|
||||
- mov crc2, crc_init
|
||||
- crc32 %rax, crc_init
|
||||
+ mov crc2, crc_init_q
|
||||
+ crc32 %rax, crc_init_q
|
||||
|
||||
################################################################
|
||||
## 5) Check for end:
|
||||
@@ -220,10 +216,9 @@ LABEL crc_ %i
|
||||
|
||||
LABEL crc_ 0
|
||||
ENDBR
|
||||
- mov tmp, len
|
||||
- cmp $128*24, tmp
|
||||
+ cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
- cmp $SMALL_SIZE, tmp
|
||||
+ cmp $SMALL_SIZE, len
|
||||
jae .Lcontinue_block
|
||||
|
||||
#######################################################################
|
||||
@@ -232,30 +227,30 @@ LABEL crc_ 0
|
||||
.Lsmall:
|
||||
test len, len
|
||||
jz .Ldone
|
||||
- mov len_dw, %eax
|
||||
+ mov len, %eax
|
||||
shr $3, %eax
|
||||
jz .Ldo_dword
|
||||
.Ldo_qwords:
|
||||
- crc32q (bufptmp), crc_init
|
||||
+ crc32q (bufptmp), crc_init_q
|
||||
add $8, bufptmp
|
||||
dec %eax
|
||||
jnz .Ldo_qwords
|
||||
.Ldo_dword:
|
||||
- test $4, len_dw
|
||||
+ test $4, len
|
||||
jz .Ldo_word
|
||||
- crc32l (bufptmp), crc_init_dw
|
||||
+ crc32l (bufptmp), crc_init
|
||||
add $4, bufptmp
|
||||
.Ldo_word:
|
||||
- test $2, len_dw
|
||||
+ test $2, len
|
||||
jz .Ldo_byte
|
||||
- crc32w (bufptmp), crc_init_dw
|
||||
+ crc32w (bufptmp), crc_init
|
||||
add $2, bufptmp
|
||||
.Ldo_byte:
|
||||
- test $1, len_dw
|
||||
+ test $1, len
|
||||
jz .Ldone
|
||||
- crc32b (bufptmp), crc_init_dw
|
||||
+ crc32b (bufptmp), crc_init
|
||||
.Ldone:
|
||||
- movq crc_init, %rax
|
||||
+ mov crc_init, %eax
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbx
|
debian/patches/patchset-pf/crypto/0003-crypto-x86-crc32c-eliminate-jump-table-and-excessive.patch (new, vendored, 374 lines)
@@ -0,0 +1,374 @@
From 8706bf3e3cba8c708f9933f0d1c6a23f9c2c8c33 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 13 Oct 2024 21:06:49 -0700
Subject: crypto: x86/crc32c - eliminate jump table and excessive unrolling

crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully
unrolled and uses a jump table to jump into the correct location. This
optimization is misguided, as it bloats the binary code size and
introduces an indirect call. x86_64 CPUs can predict loops well, so it
is fine to just use a loop instead. Loop bookkeeping instructions can
compete with the crc instructions for the ALUs, but this is easily
mitigated by unrolling the loop by a smaller amount, such as 4 times.

Therefore, re-roll the loop and make related tweaks to the code.

This reduces the binary code size of crc_pclmul() from 4546 bytes to 418
bytes, a 91% reduction. In general it also makes the code faster, with
some large improvements seen when retpoline is enabled.

More detailed performance results are shown below. They are given as
percent improvement in throughput (negative means regressed) for CPU
microarchitecture vs. input length in bytes. E.g. an improvement from
40 GB/s to 50 GB/s would be listed as 25%.

Table 1: Results with retpoline enabled (the default):

                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |

Table 2: Results with retpoline disabled:

                     |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
---------------------+-------+-------+-------+-------+-------+-------+
Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 233 +++++++++-------------
 1 file changed, 92 insertions(+), 141 deletions(-)

--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -7,6 +7,7 @@
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation.
|
||||
+ * Copyright 2024 Google LLC
|
||||
*
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
@@ -44,18 +45,9 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
-#include <asm/nospec-branch.h>
|
||||
|
||||
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||||
|
||||
-.macro LABEL prefix n
|
||||
-.L\prefix\n\():
|
||||
-.endm
|
||||
-
|
||||
-.macro JMPTBL_ENTRY i
|
||||
-.quad .Lcrc_\i
|
||||
-.endm
|
||||
-
|
||||
# Define threshold below which buffers are considered "small" and routed to
|
||||
# regular CRC code that does not interleave the CRC instructions.
|
||||
#define SMALL_SIZE 200
|
||||
@@ -64,139 +56,116 @@
|
||||
|
||||
.text
|
||||
SYM_FUNC_START(crc_pcl)
|
||||
-#define bufp rdi
|
||||
-#define bufp_dw %edi
|
||||
-#define bufp_w %di
|
||||
-#define bufp_b %dil
|
||||
-#define bufptmp %rcx
|
||||
-#define block_0 %rcx
|
||||
-#define block_1 %rdx
|
||||
-#define block_2 %r11
|
||||
-#define len %esi
|
||||
-#define crc_init_arg %edx
|
||||
-#define tmp %rbx
|
||||
-#define crc_init %r8d
|
||||
-#define crc_init_q %r8
|
||||
-#define crc1 %r9
|
||||
-#define crc2 %r10
|
||||
-
|
||||
- pushq %rbx
|
||||
- pushq %rdi
|
||||
- pushq %rsi
|
||||
-
|
||||
- ## Move crc_init for Linux to a different
|
||||
- mov crc_init_arg, crc_init
|
||||
+#define bufp %rdi
|
||||
+#define bufp_d %edi
|
||||
+#define len %esi
|
||||
+#define crc_init %edx
|
||||
+#define crc_init_q %rdx
|
||||
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
|
||||
+#define n_misaligned_q %rcx
|
||||
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
|
||||
+#define chunk_bytes_q %rcx
|
||||
+#define crc1 %r8
|
||||
+#define crc2 %r9
|
||||
|
||||
- mov %bufp, bufptmp # rdi = *buf
|
||||
cmp $SMALL_SIZE, len
|
||||
jb .Lsmall
|
||||
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
- neg %bufp
|
||||
- and $7, %bufp # calculate the unalignment amount of
|
||||
+ mov bufp_d, n_misaligned
|
||||
+ neg n_misaligned
|
||||
+ and $7, n_misaligned # calculate the misalignment amount of
|
||||
# the address
|
||||
- je .Lproc_block # Skip if aligned
|
||||
+ je .Laligned # Skip if aligned
|
||||
|
||||
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
|
||||
+ # the remaining data to an 8-byte boundary.
|
||||
.Ldo_align:
|
||||
- #### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
- movq (bufptmp), tmp # load a quadward from the buffer
|
||||
- add %bufp, bufptmp # align buffer pointer for quadword
|
||||
- # processing
|
||||
- sub bufp_dw, len # update buffer length
|
||||
+ movq (bufp), %rax
|
||||
+ add n_misaligned_q, bufp
|
||||
+ sub n_misaligned, len
|
||||
.Lalign_loop:
|
||||
- crc32b %bl, crc_init # compute crc32 of 1-byte
|
||||
- shr $8, tmp # get next byte
|
||||
- dec %bufp
|
||||
+ crc32b %al, crc_init # compute crc32 of 1-byte
|
||||
+ shr $8, %rax # get next byte
|
||||
+ dec n_misaligned
|
||||
jne .Lalign_loop
|
||||
-
|
||||
-.Lproc_block:
|
||||
+.Laligned:
|
||||
|
||||
################################################################
|
||||
- ## 2) PROCESS BLOCKS:
|
||||
+ ## 2) PROCESS BLOCK:
|
||||
################################################################
|
||||
|
||||
- ## compute num of bytes to be processed
|
||||
-
|
||||
cmp $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
-.Lcontinue_block:
|
||||
- ## len < 128*24
|
||||
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
- mul len
|
||||
- shrq $16, %rax
|
||||
-
|
||||
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
-
|
||||
- ## process rax 24-byte chunks (128 >= rax >= 0)
|
||||
-
|
||||
- ## compute end address of each block
|
||||
- ## block 0 (base addr + RAX * 8)
|
||||
- ## block 1 (base addr + RAX * 16)
|
||||
- ## block 2 (base addr + RAX * 24)
|
||||
- lea (bufptmp, %rax, 8), block_0
|
||||
- lea (block_0, %rax, 8), block_1
|
||||
- lea (block_1, %rax, 8), block_2
|
||||
-
|
||||
- xor crc1, crc1
|
||||
- xor crc2, crc2
|
||||
-
|
||||
- ## branch into array
|
||||
- leaq jump_table(%rip), %bufp
|
||||
- mov (%bufp,%rax,8), %bufp
|
||||
- JMP_NOSPEC bufp
|
||||
+.Lpartial_block:
|
||||
+ # Compute floor(len / 24) to get num qwords to process from each lane.
|
||||
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
|
||||
+ shr $16, %eax
|
||||
+ jmp .Lcrc_3lanes
|
||||
|
||||
- ################################################################
|
||||
- ## 2a) PROCESS FULL BLOCKS:
|
||||
- ################################################################
|
||||
.Lfull_block:
|
||||
- movl $128,%eax
|
||||
- lea 128*8*2(block_0), block_1
|
||||
- lea 128*8*3(block_0), block_2
|
||||
- add $128*8*1, block_0
|
||||
-
|
||||
- xor crc1,crc1
|
||||
- xor crc2,crc2
|
||||
-
|
||||
- # Fall through into top of crc array (crc_128)
|
||||
+ # Processing 128 qwords from each lane.
|
||||
+ mov $128, %eax
|
||||
|
||||
################################################################
|
||||
- ## 3) CRC Array:
|
||||
+ ## 3) CRC each of three lanes:
|
||||
################################################################
|
||||
|
||||
- i=128
|
||||
-.rept 128-1
|
||||
-.altmacro
|
||||
-LABEL crc_ %i
|
||||
-.noaltmacro
|
||||
- ENDBR
|
||||
- crc32q -i*8(block_0), crc_init_q
|
||||
- crc32q -i*8(block_1), crc1
|
||||
- crc32q -i*8(block_2), crc2
|
||||
- i=(i-1)
|
||||
-.endr
|
||||
-
|
||||
-.altmacro
|
||||
-LABEL crc_ %i
|
||||
-.noaltmacro
|
||||
- ENDBR
|
||||
- crc32q -i*8(block_0), crc_init_q
|
||||
- crc32q -i*8(block_1), crc1
|
||||
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
+.Lcrc_3lanes:
|
||||
+ xor crc1,crc1
|
||||
+ xor crc2,crc2
|
||||
+ mov %eax, chunk_bytes
|
||||
+ shl $3, chunk_bytes # num bytes to process from each lane
|
||||
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
|
||||
+ jl .Lcrc_3lanes_4x_done
|
||||
+
|
||||
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
|
||||
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
|
||||
+.Lcrc_3lanes_4x_loop:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 8(bufp), crc_init_q
|
||||
+ crc32q 8(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 16(bufp), crc_init_q
|
||||
+ crc32q 16(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
|
||||
+ crc32q 24(bufp), crc_init_q
|
||||
+ crc32q 24(bufp,chunk_bytes_q), crc1
|
||||
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
|
||||
+ add $32, bufp
|
||||
+ sub $4, %eax
|
||||
+ jge .Lcrc_3lanes_4x_loop
|
||||
+
|
||||
+.Lcrc_3lanes_4x_done:
|
||||
+ add $4, %eax
|
||||
+ jz .Lcrc_3lanes_last_qword
|
||||
+
|
||||
+.Lcrc_3lanes_1x_loop:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+ crc32q (bufp,chunk_bytes_q,2), crc2
|
||||
+ add $8, bufp
|
||||
+ dec %eax
|
||||
+ jnz .Lcrc_3lanes_1x_loop
|
||||
|
||||
- mov block_2, block_0
|
||||
+.Lcrc_3lanes_last_qword:
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ crc32q (bufp,chunk_bytes_q), crc1
|
||||
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
|
||||
|
||||
################################################################
|
||||
## 4) Combine three results:
|
||||
################################################################
|
||||
|
||||
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
|
||||
- shlq $3, %rax # rax *= 8
|
||||
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
- sub %eax, len # len -= rax*24
|
||||
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
|
||||
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
|
||||
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
|
||||
+ sub %eax, len # len -= chunk_bytes * 3
|
||||
|
||||
movq crc_init_q, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
@@ -206,20 +175,19 @@ LABEL crc_ %i
|
||||
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
- xor -i*8(block_2), %rax
|
||||
+ xor (bufp,chunk_bytes_q,2), %rax
|
||||
mov crc2, crc_init_q
|
||||
crc32 %rax, crc_init_q
|
||||
+ lea 8(bufp,chunk_bytes_q,2), bufp
|
||||
|
||||
################################################################
|
||||
- ## 5) Check for end:
|
||||
+ ## 5) If more blocks remain, goto (2):
|
||||
################################################################
|
||||
|
||||
-LABEL crc_ 0
|
||||
- ENDBR
|
||||
cmp $128*24, len
|
||||
- jae .Lfull_block
|
||||
+ jae .Lfull_block
|
||||
cmp $SMALL_SIZE, len
|
||||
- jae .Lcontinue_block
|
||||
+ jae .Lpartial_block
|
||||
|
||||
#######################################################################
|
||||
## 6) Process any remainder without interleaving:
|
||||
@@ -231,47 +199,30 @@ LABEL crc_ 0
|
||||
shr $3, %eax
|
||||
jz .Ldo_dword
|
||||
.Ldo_qwords:
|
||||
- crc32q (bufptmp), crc_init_q
|
||||
- add $8, bufptmp
|
||||
+ crc32q (bufp), crc_init_q
|
||||
+ add $8, bufp
|
||||
dec %eax
|
||||
jnz .Ldo_qwords
|
||||
.Ldo_dword:
|
||||
test $4, len
|
||||
jz .Ldo_word
|
||||
- crc32l (bufptmp), crc_init
|
||||
- add $4, bufptmp
|
||||
+ crc32l (bufp), crc_init
|
||||
+ add $4, bufp
|
||||
.Ldo_word:
|
||||
test $2, len
|
||||
jz .Ldo_byte
|
||||
- crc32w (bufptmp), crc_init
|
||||
- add $2, bufptmp
|
||||
+ crc32w (bufp), crc_init
|
||||
+ add $2, bufp
|
||||
.Ldo_byte:
|
||||
test $1, len
|
||||
jz .Ldone
|
||||
- crc32b (bufptmp), crc_init
|
||||
+ crc32b (bufp), crc_init
|
||||
.Ldone:
|
||||
mov crc_init, %eax
|
||||
- popq %rsi
|
||||
- popq %rdi
|
||||
- popq %rbx
|
||||
RET
|
||||
SYM_FUNC_END(crc_pcl)
|
||||
|
||||
.section .rodata, "a", @progbits
|
||||
- ################################################################
|
||||
- ## jump table Table is 129 entries x 2 bytes each
|
||||
- ################################################################
|
||||
-.align 4
|
||||
-jump_table:
|
||||
- i=0
|
||||
-.rept 129
|
||||
-.altmacro
|
||||
-JMPTBL_ENTRY %i
|
||||
-.noaltmacro
|
||||
- i=i+1
|
||||
-.endr
|
||||
-
|
||||
-
|
||||
################################################################
|
||||
## PCLMULQDQ tables
|
||||
## Table is 128 entries x 2 words (8 bytes) each
|
31 debian/patches/patchset-pf/fixes/0001-arch-Kconfig-Default-to-maximum-amount-of-ASLR-bits.patch vendored Normal file
@@ -0,0 +1,31 @@
|
||||
From cda0e050fec85635986e9cfe991e26339bf305dc Mon Sep 17 00:00:00 2001
|
||||
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
|
||||
Date: Sat, 13 Jan 2024 15:29:25 +0100
|
||||
Subject: arch/Kconfig: Default to maximum amount of ASLR bits
|
||||
|
||||
To mitigate https://zolutal.github.io/aslrnt/; do this with a patch to
|
||||
avoid having to enable `CONFIG_EXPERT`.
|
||||
---
|
||||
arch/Kconfig | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/arch/Kconfig
|
||||
+++ b/arch/Kconfig
|
||||
@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS
|
||||
int "Number of bits to use for ASLR of mmap base address" if EXPERT
|
||||
range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
|
||||
default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
|
||||
- default ARCH_MMAP_RND_BITS_MIN
|
||||
+ default ARCH_MMAP_RND_BITS_MAX
|
||||
depends on HAVE_ARCH_MMAP_RND_BITS
|
||||
help
|
||||
This value can be used to select the number of bits to use to
|
||||
@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
|
||||
int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
|
||||
range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
|
||||
default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
|
||||
- default ARCH_MMAP_RND_COMPAT_BITS_MIN
|
||||
+ default ARCH_MMAP_RND_COMPAT_BITS_MAX
|
||||
depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
|
||||
help
|
||||
This value can be used to select the number of bits to use to
|
112 debian/patches/patchset-pf/fixes/0002-cpufreq-Remove-LATENCY_MULTIPLIER.patch vendored Normal file
@@ -0,0 +1,112 @@
|
||||
From b7d96c1f19ef15ea431a8d5d7ab2cad22c35edba Mon Sep 17 00:00:00 2001
|
||||
From: Qais Yousef <qyousef@layalina.io>
|
||||
Date: Sun, 28 Jul 2024 20:26:59 +0100
|
||||
Subject: cpufreq: Remove LATENCY_MULTIPLIER
|
||||
|
||||
The current LATENCY_MULTIPLIER which has been around for nearly 20 years
|
||||
causes rate_limit_us to be always in ms range.
|
||||
|
||||
On M1 mac mini I get 50 and 56us transition latency, but due to the 1000
|
||||
multiplier we end up setting rate_limit_us to 50 and 56ms, which gets
|
||||
capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change
|
||||
default transition delay to 2ms")
|
||||
|
||||
On an Intel i5 system the transition latency is 20us but due to the multiplier
|
||||
we end up with 20ms that again is capped to 2ms.
|
||||
|
||||
Given how good modern hardware is and how modern workloads require systems
|
||||
to be more responsive to cater for sudden changes in workload (tasks
|
||||
sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and
|
||||
that 2ms is a quarter of the frame time of a 120Hz refresh rate system, drop the
|
||||
old logic in favour of providing 50% headroom.
|
||||
|
||||
rate_limit_us = 1.5 * latency.
|
||||
|
||||
I considered not adding any headroom which could mean that we can end up
|
||||
with infinite back-to-back requests.
|
||||
|
||||
I also considered providing a constant headroom (e.g: 100us) assuming
|
||||
that any h/w or f/w dealing with the request shouldn't require a large
|
||||
headroom when transition_latency is actually high.
|
||||
|
||||
But for both cases I wasn't sure if h/w or f/w can end up being
|
||||
overwhelmed dealing with the freq requests in a potentially busy system.
|
||||
So I opted for providing 50% breathing room.
|
||||
|
||||
This is expected to impact schedutil only as the other user,
|
||||
dbs_governor, takes the max(2*tick, transition_delay_us) and the former
|
||||
was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us
|
||||
before applying this patch. For systems with TICK of 4ms, this value
|
||||
would have almost always ended up with 8ms sampling rate.
|
||||
|
||||
For systems that report 0 transition latency, we still default to
|
||||
returning 1ms as transition delay.
|
||||
|
||||
This helps in eliminating a source of latency for applying requests as
|
||||
mentioned in [1]. For example if we have a 1ms tick, most systems will
|
||||
miss sending an update at tick when updating the util_avg for a task/CPU
|
||||
(rate_limit_us will be 2ms for most systems).
|
||||
|
||||
Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1]
|
||||
Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/
|
||||
Signed-off-by: Qais Yousef <qyousef@layalina.io>
|
||||
Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io
|
||||
[ rjw: Subject edits ]
|
||||
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
---
|
||||
drivers/cpufreq/cpufreq.c | 27 ++++-----------------------
|
||||
include/linux/cpufreq.h | 6 ------
|
||||
2 files changed, 4 insertions(+), 29 deletions(-)
|
||||
|
||||
--- a/drivers/cpufreq/cpufreq.c
|
||||
+++ b/drivers/cpufreq/cpufreq.c
|
||||
@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_d
|
||||
return policy->transition_delay_us;
|
||||
|
||||
latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
|
||||
- if (latency) {
|
||||
- unsigned int max_delay_us = 2 * MSEC_PER_SEC;
|
||||
+ if (latency)
|
||||
+ /* Give a 50% breathing room between updates */
|
||||
+ return latency + (latency >> 1);
|
||||
|
||||
- /*
|
||||
- * If the platform already has high transition_latency, use it
|
||||
- * as-is.
|
||||
- */
|
||||
- if (latency > max_delay_us)
|
||||
- return latency;
|
||||
-
|
||||
- /*
|
||||
- * For platforms that can change the frequency very fast (< 2
|
||||
- * us), the above formula gives a decent transition delay. But
|
||||
- * for platforms where transition_latency is in milliseconds, it
|
||||
- * ends up giving unrealistic values.
|
||||
- *
|
||||
- * Cap the default transition delay to 2 ms, which seems to be
|
||||
- * a reasonable amount of time after which we should reevaluate
|
||||
- * the frequency.
|
||||
- */
|
||||
- return min(latency * LATENCY_MULTIPLIER, max_delay_us);
|
||||
- }
|
||||
-
|
||||
- return LATENCY_MULTIPLIER;
|
||||
+ return USEC_PER_MSEC;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
|
||||
|
||||
--- a/include/linux/cpufreq.h
|
||||
+++ b/include/linux/cpufreq.h
|
||||
@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scal
|
||||
#define CPUFREQ_POLICY_POWERSAVE (1)
|
||||
#define CPUFREQ_POLICY_PERFORMANCE (2)
|
||||
|
||||
-/*
|
||||
- * The polling frequency depends on the capability of the processor. Default
|
||||
- * polling frequency is 1000 times the transition latency of the processor.
|
||||
- */
|
||||
-#define LATENCY_MULTIPLIER (1000)
|
||||
-
|
||||
struct cpufreq_governor {
|
||||
char name[CPUFREQ_NAME_LEN];
|
||||
int (*init)(struct cpufreq_policy *policy);
|
83 debian/patches/patchset-pf/fixes/0003-drivers-firmware-skip-simpledrm-if-nvidia-drm.modese.patch vendored Normal file
@@ -0,0 +1,83 @@
|
||||
From 218e958524c673d6e68737e7f82d80ba2b6ef59a Mon Sep 17 00:00:00 2001
|
||||
From: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Date: Thu, 19 May 2022 14:40:07 +0200
|
||||
Subject: drivers/firmware: skip simpledrm if nvidia-drm.modeset=1 is set
|
||||
|
||||
The Nvidia proprietary driver has some bugs that lead to issues if used
|
||||
with the simpledrm driver. The most noticeable is that it does not register
|
||||
an emulated fbdev device.
|
||||
|
||||
It just relies on an fbdev to be registered by another driver, one that
|
||||
could then be attached to the framebuffer console. On UEFI machines,
|
||||
this is the efifb driver.
|
||||
|
||||
This means that disabling the efifb driver will cause virtual consoles to
|
||||
not be present in the system when using the Nvidia driver. Legacy BIOS is
|
||||
not affected just because fbcon is not used there, but instead vgacon.
|
||||
|
||||
Unless a VGA mode is specified using the vga= kernel command line option,
|
||||
in that case the vesafb driver is used instead and its fbdev attached to
|
||||
the fbcon.
|
||||
|
||||
This is a problem because with CONFIG_SYSFB_SIMPLEFB=y, the sysfb platform
|
||||
code attempts to register a "simple-framebuffer" platform device (that is
|
||||
matched against simpledrm) and only registers either an "efi-framebuffer"
|
||||
or "vesa-framebuffer" if this fails to be registered due the video modes
|
||||
not being compatible.
|
||||
|
||||
The Nvidia driver relying on another driver to register the fbdev is quite
|
||||
fragile, since it can't really assume those will stick around. For example
|
||||
there are patches posted to remove the EFI and VESA platform devices once
|
||||
a real DRM or fbdev driver probes.
|
||||
|
||||
But in any case, moving to a simpledrm + emulated fbdev only breaks this
|
||||
assumption and causes users to not have VT if the Nvidia driver is used.
|
||||
|
||||
So to prevent this, let's add a workaround and make the sysfb to skip the
|
||||
"simple-framebuffer" registration when nvidia-drm.modeset=1 option is set.
|
||||
|
||||
This is quite horrible, but honestly I can't think of any other approach.
|
||||
|
||||
For this to work, the CONFIG_FB_EFI and CONFIG_FB_VESA config options must
|
||||
be enabled besides CONFIG_DRM_SIMPLEDRM.
|
||||
|
||||
Signed-off-by: Javier Martinez Canillas <javierm@redhat.com>
|
||||
Cherry-picked-for: https://bugs.archlinux.org/task/73720
|
||||
---
|
||||
drivers/firmware/sysfb.c | 18 +++++++++++++++++-
|
||||
1 file changed, 17 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/firmware/sysfb.c
|
||||
+++ b/drivers/firmware/sysfb.c
|
||||
@@ -35,6 +35,22 @@
|
||||
#include <linux/screen_info.h>
|
||||
#include <linux/sysfb.h>
|
||||
|
||||
+static int skip_simpledrm;
|
||||
+
|
||||
+static int __init simpledrm_disable(char *opt)
|
||||
+{
|
||||
+ if (!opt)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ get_option(&opt, &skip_simpledrm);
|
||||
+
|
||||
+ if (skip_simpledrm)
|
||||
+ pr_info("The simpledrm driver will not be probed\n");
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+early_param("nvidia-drm.modeset", simpledrm_disable);
|
||||
+
|
||||
static struct platform_device *pd;
|
||||
static DEFINE_MUTEX(disable_lock);
|
||||
static bool disabled;
|
||||
@@ -145,7 +161,7 @@ static __init int sysfb_init(void)
|
||||
|
||||
/* try to create a simple-framebuffer device */
|
||||
compatible = sysfb_parse_mode(si, &mode);
|
||||
- if (compatible) {
|
||||
+ if (compatible && !skip_simpledrm) {
|
||||
pd = sysfb_create_simplefb(si, &mode, parent);
|
||||
if (!IS_ERR(pd))
|
||||
goto put_device;
|
26 debian/patches/patchset-pf/fixes/0004-nfsd-add-more-info-to-WARN_ON_ONCE-on-failed-callbac.patch vendored Normal file
@@ -0,0 +1,26 @@
|
||||
From b97d21a0aa65a6f7a7bb17bbc696b136688c96ed Mon Sep 17 00:00:00 2001
|
||||
From: Jeff Layton <jlayton@kernel.org>
|
||||
Date: Mon, 26 Aug 2024 08:50:11 -0400
|
||||
Subject: nfsd: add more info to WARN_ON_ONCE on failed callbacks
|
||||
|
||||
Currently, you get the warning and stack trace, but nothing is printed
|
||||
about the relevant error codes. Add that in.
|
||||
|
||||
Signed-off-by: Jeff Layton <jlayton@kernel.org>
|
||||
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
|
||||
---
|
||||
fs/nfsd/nfs4callback.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/fs/nfsd/nfs4callback.c
|
||||
+++ b/fs/nfsd/nfs4callback.c
|
||||
@@ -1333,7 +1333,8 @@ static void nfsd4_cb_done(struct rpc_tas
|
||||
return;
|
||||
|
||||
if (cb->cb_status) {
|
||||
- WARN_ON_ONCE(task->tk_status);
|
||||
+ WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d",
|
||||
+ cb->cb_status, task->tk_status);
|
||||
task->tk_status = cb->cb_status;
|
||||
}
|
||||
|
57 debian/patches/patchset-pf/fixes/0005-e1000e-Remove-Meteor-Lake-SMBUS-workarounds.patch vendored Normal file
@@ -0,0 +1,57 @@
|
||||
From 1d120544580708eae6bd5981b308ca17735edaac Mon Sep 17 00:00:00 2001
|
||||
From: Vitaly Lifshits <vitaly.lifshits@intel.com>
|
||||
Date: Tue, 1 Oct 2024 20:08:48 +0300
|
||||
Subject: e1000e: Remove Meteor Lake SMBUS workarounds
|
||||
|
||||
This is a partial revert of commit 76a0a3f9cc2f ("e1000e: fix force smbus
|
||||
during suspend flow"). That commit fixed a sporadic PHY access issue but
|
||||
introduced a regression in runtime suspend flows.
|
||||
The original issue on Meteor Lake systems was rare in terms of the
|
||||
reproduction rate and the number of systems affected.
|
||||
|
||||
After the integration of commit 0a6ad4d9e169 ("e1000e: avoid failing the
|
||||
system during pm_suspend"), PHY access loss can no longer cause a
|
||||
system-level suspend failure. It only occurs when the LAN cable is
|
||||
disconnected, and is recovered during the system resume flow. Therefore, its
|
||||
functional impact is low, and the priority is given to stabilizing
|
||||
runtime suspend.
|
||||
|
||||
Fixes: 76a0a3f9cc2f ("e1000e: fix force smbus during suspend flow")
|
||||
Signed-off-by: Vitaly Lifshits <vitaly.lifshits@intel.com>
|
||||
---
|
||||
drivers/net/ethernet/intel/e1000e/ich8lan.c | 17 ++++-------------
|
||||
1 file changed, 4 insertions(+), 13 deletions(-)
|
||||
|
||||
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
|
||||
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
|
||||
@@ -1205,12 +1205,10 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000
|
||||
if (ret_val)
|
||||
goto out;
|
||||
|
||||
- if (hw->mac.type != e1000_pch_mtp) {
|
||||
- ret_val = e1000e_force_smbus(hw);
|
||||
- if (ret_val) {
|
||||
- e_dbg("Failed to force SMBUS: %d\n", ret_val);
|
||||
- goto release;
|
||||
- }
|
||||
+ ret_val = e1000e_force_smbus(hw);
|
||||
+ if (ret_val) {
|
||||
+ e_dbg("Failed to force SMBUS: %d\n", ret_val);
|
||||
+ goto release;
|
||||
}
|
||||
|
||||
/* Si workaround for ULP entry flow on i127/rev6 h/w. Enable
|
||||
@@ -1273,13 +1271,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000
|
||||
}
|
||||
|
||||
release:
|
||||
- if (hw->mac.type == e1000_pch_mtp) {
|
||||
- ret_val = e1000e_force_smbus(hw);
|
||||
- if (ret_val)
|
||||
- e_dbg("Failed to force SMBUS over MTL system: %d\n",
|
||||
- ret_val);
|
||||
- }
|
||||
-
|
||||
hw->phy.ops.release(hw);
|
||||
out:
|
||||
if (ret_val)
|
46 debian/patches/patchset-pf/fixes/0006-btrfs-zoned-fix-zone-unusable-accounting-for-freed-r.patch vendored Normal file
@@ -0,0 +1,46 @@
|
||||
From 4086c1a804741c9c8f418d6088e8c531f2a481f3 Mon Sep 17 00:00:00 2001
|
||||
From: Naohiro Aota <naohiro.aota@wdc.com>
|
||||
Date: Tue, 1 Oct 2024 17:03:32 +0900
|
||||
Subject: btrfs: zoned: fix zone unusable accounting for freed reserved extent
|
||||
|
||||
When btrfs reserves an extent and does not use it (e.g, by an error), it
|
||||
calls btrfs_free_reserved_extent() to free the reserved extent. In the
|
||||
process, it calls btrfs_add_free_space() and then it accounts the region
|
||||
bytes as block_group->zone_unusable.
|
||||
|
||||
However, it leaves the space_info->bytes_zone_unusable side not updated. As
|
||||
a result, ENOSPC can happen while a space_info reservation succeeded. The
|
||||
reservation is fine because the freed region is not added in
|
||||
space_info->bytes_zone_unusable, leaving that space as "free". OTOH,
|
||||
corresponding block group counts it as zone_unusable and its allocation
|
||||
pointer is not rewound, we cannot allocate an extent from that block group.
|
||||
That will also negate space_info's async/sync reclaim process, and cause an
|
||||
ENOSPC error from the extent allocation process.
|
||||
|
||||
Fix that by returning the space to space_info->bytes_zone_unusable.
|
||||
Ideally, since a bio is not submitted for this reserved region, we should
|
||||
return the space to free space and rewind the allocation pointer. But, it
|
||||
needs rework on extent allocation handling, so let it work in this way for
|
||||
now.
|
||||
|
||||
Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones")
|
||||
CC: stable@vger.kernel.org # 5.15+
|
||||
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
|
||||
Reviewed-by: David Sterba <dsterba@suse.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/block-group.c | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
--- a/fs/btrfs/block-group.c
|
||||
+++ b/fs/btrfs/block-group.c
|
||||
@@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct bt
|
||||
spin_lock(&cache->lock);
|
||||
if (cache->ro)
|
||||
space_info->bytes_readonly += num_bytes;
|
||||
+ else if (btrfs_is_zoned(cache->fs_info))
|
||||
+ space_info->bytes_zone_unusable += num_bytes;
|
||||
cache->reserved -= num_bytes;
|
||||
space_info->bytes_reserved -= num_bytes;
|
||||
space_info->max_extent_size = 0;
|
64 debian/patches/patchset-pf/fixes/0007-btrfs-clear-force-compress-on-remount-when-compress-.patch vendored Normal file
@@ -0,0 +1,64 @@
|
||||
From aa8155f0ba032729ec4f28c5cb9669fb14f6947b Mon Sep 17 00:00:00 2001
|
||||
From: Filipe Manana <fdmanana@suse.com>
|
||||
Date: Mon, 14 Oct 2024 16:14:18 +0100
|
||||
Subject: btrfs: clear force-compress on remount when compress mount option is
|
||||
given
|
||||
|
||||
After the migration to use fs context for processing mount options we had
|
||||
a slight change in the semantics for remounting a filesystem that was
|
||||
mounted with compress-force. Before we could clear compress-force by
|
||||
passing only "-o compress[=algo]" during a remount, but after that change
|
||||
that does not work anymore, force-compress is still present and one needs
|
||||
to pass "-o compress-force=no,compress[=algo]" to the mount command.
|
||||
|
||||
Example, when running on a kernel 6.8+:
|
||||
|
||||
$ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi
|
||||
$ mount | grep sdi
|
||||
/dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/)
|
||||
|
||||
$ mount -o remount,compress=zlib:5 /mnt/sdi
|
||||
$ mount | grep sdi
|
||||
/dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/)
|
||||
|
||||
On a 6.7 kernel (or older):
|
||||
|
||||
$ mount -o compress-force=zlib:9 /dev/sdi /mnt/sdi
|
||||
$ mount | grep sdi
|
||||
/dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress-force=zlib:9,discard=async,space_cache=v2,subvolid=5,subvol=/)
|
||||
|
||||
$ mount -o remount,compress=zlib:5 /mnt/sdi
|
||||
$ mount | grep sdi
|
||||
/dev/sdi on /mnt/sdi type btrfs (rw,relatime,compress=zlib:5,discard=async,space_cache=v2,subvolid=5,subvol=/)
|
||||
|
||||
So update btrfs_parse_param() to clear "compress-force" when "compress" is
|
||||
given, providing the same semantics as kernel 6.7 and older.
|
||||
|
||||
Reported-by: Roman Mamedov <rm@romanrm.net>
|
||||
Link: https://lore.kernel.org/linux-btrfs/20241014182416.13d0f8b0@nvm/
|
||||
CC: stable@vger.kernel.org # 6.8+
|
||||
Signed-off-by: Filipe Manana <fdmanana@suse.com>
|
||||
Reviewed-by: David Sterba <dsterba@suse.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/super.c | 9 +++++++++
|
||||
1 file changed, 9 insertions(+)
|
||||
|
||||
--- a/fs/btrfs/super.c
|
||||
+++ b/fs/btrfs/super.c
|
||||
@@ -340,6 +340,15 @@ static int btrfs_parse_param(struct fs_c
|
||||
fallthrough;
|
||||
case Opt_compress:
|
||||
case Opt_compress_type:
|
||||
+ /*
|
||||
+ * Provide the same semantics as older kernels that don't use fs
|
||||
+ * context, specifying the "compress" option clears
|
||||
+ * "force-compress" without the need to pass
|
||||
+ * "compress-force=[no|none]" before specifying "compress".
|
||||
+ */
|
||||
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
|
||||
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
|
||||
+
|
||||
if (opt == Opt_compress || opt == Opt_compress_force) {
|
||||
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
|
||||
ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
|
68 debian/patches/patchset-pf/fixes/0008-btrfs-qgroup-set-a-more-sane-default-value-for-subtr.patch vendored Normal file
@@ -0,0 +1,68 @@
|
||||
From 81baeb2a67d8245ac5b61299e54dd65defd4ac72 Mon Sep 17 00:00:00 2001
|
||||
From: Qu Wenruo <wqu@suse.com>
|
||||
Date: Tue, 10 Sep 2024 15:21:04 +0930
|
||||
Subject: btrfs: qgroup: set a more sane default value for subtree drop
|
||||
threshold
|
||||
|
||||
Since commit 011b46c30476 ("btrfs: skip subtree scan if it's too high to
|
||||
avoid low stall in btrfs_commit_transaction()"), btrfs qgroup can
|
||||
automatically skip large subtree scan at the cost of marking qgroup
|
||||
inconsistent.
|
||||
|
||||
It's designed to address the final performance problem of snapshot drop
|
||||
with qgroup enabled, but to be safe the default value is
|
||||
BTRFS_MAX_LEVEL, requiring a user space daemon to set a different value
|
||||
to make it work.
|
||||
|
||||
I'd say it's not a good idea to rely on a user space tool to set this
|
||||
default value, especially when some operations (snapshot dropping) can
|
||||
be triggered immediately after mount, leaving a very small window to
|
||||
tweak that sysfs interface.
|
||||
|
||||
So instead of disabling this new feature by default, enable it with a
|
||||
low threshold (3), so that large subvolume tree drop at mount time won't
|
||||
cause huge qgroup workload.
|
||||
|
||||
CC: stable@vger.kernel.org # 6.1
|
||||
Signed-off-by: Qu Wenruo <wqu@suse.com>
|
||||
Reviewed-by: David Sterba <dsterba@suse.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/disk-io.c | 2 +-
|
||||
fs/btrfs/qgroup.c | 2 +-
|
||||
fs/btrfs/qgroup.h | 2 ++
|
||||
3 files changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/fs/btrfs/disk-io.c
|
||||
+++ b/fs/btrfs/disk-io.c
|
||||
@@ -1960,7 +1960,7 @@ static void btrfs_init_qgroup(struct btr
|
||||
fs_info->qgroup_seq = 1;
|
||||
fs_info->qgroup_ulist = NULL;
|
||||
fs_info->qgroup_rescan_running = false;
|
||||
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
|
||||
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
|
||||
mutex_init(&fs_info->qgroup_rescan_lock);
|
||||
}
|
||||
|
||||
--- a/fs/btrfs/qgroup.c
|
||||
+++ b/fs/btrfs/qgroup.c
|
||||
@@ -1407,7 +1407,7 @@ int btrfs_quota_disable(struct btrfs_fs_
|
||||
fs_info->quota_root = NULL;
|
||||
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
|
||||
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
|
||||
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
|
||||
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
|
||||
spin_unlock(&fs_info->qgroup_lock);
|
||||
|
||||
btrfs_free_qgroup_config(fs_info);
|
||||
--- a/fs/btrfs/qgroup.h
|
||||
+++ b/fs/btrfs/qgroup.h
|
||||
@@ -121,6 +121,8 @@ struct btrfs_inode;
|
||||
#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
|
||||
#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
|
||||
|
||||
+#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3)
|
||||
+
|
||||
/*
|
||||
* Record a dirty extent, and info qgroup to update quota on it
|
||||
*/
|
32 debian/patches/patchset-pf/fixes/0009-btrfs-also-add-stripe-entries-for-NOCOW-writes.patch vendored Normal file
@@ -0,0 +1,32 @@
|
||||
From 8ea93b01558ea7a752e478ad25862e7441d6053a Mon Sep 17 00:00:00 2001
|
||||
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Date: Thu, 19 Sep 2024 12:16:38 +0200
|
||||
Subject: btrfs: also add stripe entries for NOCOW writes
|
||||
|
||||
NOCOW writes do not generate stripe_extent entries in the RAID stripe
|
||||
tree, as the RAID stripe-tree feature initially was designed with a
|
||||
zoned filesystem in mind and on a zoned filesystem, we do not allow NOCOW
|
||||
writes. But the RAID stripe-tree feature is independent from the zoned
|
||||
feature, so we must also do NOCOW writes for RAID stripe-tree filesystems.
|
||||
|
||||
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
|
||||
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/inode.c | 5 +++++
|
||||
1 file changed, 5 insertions(+)
|
||||
|
||||
--- a/fs/btrfs/inode.c
|
||||
+++ b/fs/btrfs/inode.c
|
||||
@@ -3087,6 +3087,11 @@ int btrfs_finish_one_ordered(struct btrf
|
||||
ret = btrfs_update_inode_fallback(trans, inode);
|
||||
if (ret) /* -ENOMEM or corruption */
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
+
|
||||
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
|
||||
+ if (ret)
|
||||
+ btrfs_abort_transaction(trans, ret);
|
||||
+
|
||||
goto out;
|
||||
}
|
||||
|
107 debian/patches/patchset-pf/fixes/0010-btrfs-fix-read-corruption-due-to-race-with-extent-ma.patch vendored Normal file
@@ -0,0 +1,107 @@
|
||||
From f6f5cd12972307324de5decd7fa41b0b3c98639c Mon Sep 17 00:00:00 2001
|
||||
From: Boris Burkov <boris@bur.io>
|
||||
Date: Fri, 18 Oct 2024 15:44:34 -0700
|
||||
Subject: btrfs: fix read corruption due to race with extent map merging
|
||||
|
||||
In debugging some corrupt squashfs files, we observed symptoms of
|
||||
corrupt page cache pages but correct on-disk contents. Further
|
||||
investigation revealed that the exact symptom was a correct page
|
||||
followed by an incorrect, duplicate, page. This got us thinking about
|
||||
extent maps.
|
||||
|
||||
commit ac05ca913e9f ("Btrfs: fix race between using extent maps and merging them")
|
||||
enforces a reference count on the primary `em` extent_map being merged,
|
||||
as that one gets modified.
|
||||
|
||||
However, since
|
||||
commit 3d2ac9922465 ("btrfs: introduce new members for extent_map")
|
||||
both 'em' and 'merge' get modified, which started modifying 'merge'
|
||||
and thus introduced the same race.
|
||||
|
||||
We were able to reproduce this by looping the affected squashfs workload
|
||||
in parallel on a bunch of separate btrfs-es while also dropping caches.
|
||||
We are still working on a simple enough reproducer to make into an fstest.
|
||||
|
||||
The simplest fix is to stop modifying 'merge', which is not essential,
|
||||
as it is dropped immediately after the merge. This behavior is simply
|
||||
a consequence of the order of the two extent maps being important in
|
||||
computing the new values. Modify merge_ondisk_extents to take prev and
|
||||
next by const* and also take a third merged parameter that it puts the
|
||||
results in. Note that this introduces the rather odd behavior of passing
|
||||
'em' to merge_ondisk_extents as a const * and as a regular ptr.
|
||||
|
||||
Fixes: 3d2ac9922465 ("btrfs: introduce new members for extent_map")
|
||||
CC: stable@vger.kernel.org # 6.11+
|
||||
Reviewed-by: Qu Wenruo <wqu@suse.com>
|
||||
Reviewed-by: Filipe Manana <fdmanana@suse.com>
|
||||
Signed-off-by: Omar Sandoval <osandov@fb.com>
|
||||
Signed-off-by: Boris Burkov <boris@bur.io>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/extent_map.c | 31 ++++++++++++++++---------------
|
||||
1 file changed, 16 insertions(+), 15 deletions(-)
|
||||
|
||||
--- a/fs/btrfs/extent_map.c
|
||||
+++ b/fs/btrfs/extent_map.c
|
||||
@@ -240,13 +240,19 @@ static bool mergeable_maps(const struct
|
||||
/*
|
||||
* Handle the on-disk data extents merge for @prev and @next.
|
||||
*
|
||||
+ * @prev: left extent to merge
|
||||
+ * @next: right extent to merge
|
||||
+ * @merged: the extent we will not discard after the merge; updated with new values
|
||||
+ *
|
||||
+ * After this, one of the two extents is the new merged extent and the other is
|
||||
+ * removed from the tree and likely freed. Note that @merged is one of @prev/@next
|
||||
+ * so there is const/non-const aliasing occurring here.
|
||||
+ *
|
||||
* Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
|
||||
* For now only uncompressed regular extent can be merged.
|
||||
- *
|
||||
- * @prev and @next will be both updated to point to the new merged range.
|
||||
- * Thus one of them should be removed by the caller.
|
||||
*/
|
||||
-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
|
||||
+static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
|
||||
+ struct extent_map *merged)
|
||||
{
|
||||
u64 new_disk_bytenr;
|
||||
u64 new_disk_num_bytes;
|
||||
@@ -281,15 +287,10 @@ static void merge_ondisk_extents(struct
|
||||
new_disk_bytenr;
|
||||
new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
|
||||
|
||||
- prev->disk_bytenr = new_disk_bytenr;
|
||||
- prev->disk_num_bytes = new_disk_num_bytes;
|
||||
- prev->ram_bytes = new_disk_num_bytes;
|
||||
- prev->offset = new_offset;
|
||||
-
|
||||
- next->disk_bytenr = new_disk_bytenr;
|
||||
- next->disk_num_bytes = new_disk_num_bytes;
|
||||
- next->ram_bytes = new_disk_num_bytes;
|
||||
- next->offset = new_offset;
|
||||
+ merged->disk_bytenr = new_disk_bytenr;
|
||||
+ merged->disk_num_bytes = new_disk_num_bytes;
|
||||
+ merged->ram_bytes = new_disk_num_bytes;
|
||||
+ merged->offset = new_offset;
|
||||
}
|
||||
|
||||
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
|
||||
@@ -358,7 +359,7 @@ static void try_merge_map(struct btrfs_i
|
||||
em->generation = max(em->generation, merge->generation);
|
||||
|
||||
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
|
||||
- merge_ondisk_extents(merge, em);
|
||||
+ merge_ondisk_extents(merge, em, em);
|
||||
em->flags |= EXTENT_FLAG_MERGED;
|
||||
|
||||
validate_extent_map(fs_info, em);
|
||||
@@ -375,7 +376,7 @@ static void try_merge_map(struct btrfs_i
|
||||
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
|
||||
em->len += merge->len;
|
||||
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
|
||||
- merge_ondisk_extents(em, merge);
|
||||
+ merge_ondisk_extents(em, merge, em);
|
||||
validate_extent_map(fs_info, em);
|
||||
rb_erase(&merge->rb_node, &tree->root);
|
||||
RB_CLEAR_NODE(&merge->rb_node);
|
101 debian/patches/patchset-pf/fixes/0011-btrfs-reject-ro-rw-reconfiguration-if-there-are-hard.patch vendored Normal file
@@ -0,0 +1,101 @@
|
||||
From 7f83049bda761f340991af8dce79a4e98c62b378 Mon Sep 17 00:00:00 2001
|
||||
From: Qu Wenruo <wqu@suse.com>
|
||||
Date: Thu, 19 Sep 2024 20:18:11 +0930
|
||||
Subject: btrfs: reject ro->rw reconfiguration if there are hard ro
|
||||
requirements
|
||||
|
||||
[BUG]
|
||||
Syzbot reports the following crash:
|
||||
|
||||
BTRFS info (device loop0 state MCS): disabling free space tree
|
||||
BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE (0x1)
|
||||
BTRFS info (device loop0 state MCS): clearing compat-ro feature flag for FREE_SPACE_TREE_VALID (0x2)
|
||||
Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN NOPTI
|
||||
KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f]
|
||||
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014
|
||||
RIP: 0010:backup_super_roots fs/btrfs/disk-io.c:1691 [inline]
|
||||
RIP: 0010:write_all_supers+0x97a/0x40f0 fs/btrfs/disk-io.c:4041
|
||||
Call Trace:
|
||||
<TASK>
|
||||
btrfs_commit_transaction+0x1eae/0x3740 fs/btrfs/transaction.c:2530
|
||||
btrfs_delete_free_space_tree+0x383/0x730 fs/btrfs/free-space-tree.c:1312
|
||||
btrfs_start_pre_rw_mount+0xf28/0x1300 fs/btrfs/disk-io.c:3012
|
||||
btrfs_remount_rw fs/btrfs/super.c:1309 [inline]
|
||||
btrfs_reconfigure+0xae6/0x2d40 fs/btrfs/super.c:1534
|
||||
btrfs_reconfigure_for_mount fs/btrfs/super.c:2020 [inline]
|
||||
btrfs_get_tree_subvol fs/btrfs/super.c:2079 [inline]
|
||||
btrfs_get_tree+0x918/0x1920 fs/btrfs/super.c:2115
|
||||
vfs_get_tree+0x90/0x2b0 fs/super.c:1800
|
||||
do_new_mount+0x2be/0xb40 fs/namespace.c:3472
|
||||
do_mount fs/namespace.c:3812 [inline]
|
||||
__do_sys_mount fs/namespace.c:4020 [inline]
|
||||
__se_sys_mount+0x2d6/0x3c0 fs/namespace.c:3997
|
||||
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
|
||||
do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
|
||||
entry_SYSCALL_64_after_hwframe+0x77/0x7f
|
||||
|
||||
[CAUSE]
|
||||
To support mounting different subvolume with different RO/RW flags for
|
||||
the new mount APIs, btrfs introduced two workarounds to support this feature:
|
||||
|
||||
- Skip mount option/feature checks if we are mounting a different
|
||||
subvolume
|
||||
|
||||
- Reconfigure the fs to RW if the initial mount is RO
|
||||
|
||||
Combining these two, we can have the following sequence:
|
||||
|
||||
- Mount the fs ro,rescue=all,clear_cache,space_cache=v1
|
||||
rescue=all will mark the fs as hard read-only, so no v2 cache clearing
|
||||
will happen.
|
||||
|
||||
- Mount a subvolume rw of the same fs.
|
||||
We go into btrfs_get_tree_subvol(), but fc_mount() returns EBUSY
|
||||
because our new fc is RW, different from the original fs.
|
||||
|
||||
Now we enter btrfs_reconfigure_for_mount(), which switches the RO flag
|
||||
first so that we can grab the existing fs_info.
|
||||
Then we reconfigure the fs to RW.
|
||||
|
||||
- During reconfiguration, option/features check is skipped
|
||||
This means we will restart the v2 cache clearing, and convert back to
|
||||
v1 cache.
|
||||
This will trigger fs writes, and since the original fs has "rescue=all"
|
||||
option, it skips the csum tree read.
|
||||
|
||||
And eventually causing NULL pointer dereference in super block
|
||||
writeback.
|
||||
|
||||
[FIX]
|
||||
For reconfiguration caused by different subvolume RO/RW flags, ensure we
|
||||
always run btrfs_check_options() to ensure we have proper hard RO
|
||||
requirements met.
|
||||
|
||||
In fact the function btrfs_check_options() doesn't really do many
|
||||
complex checks, but hard RO requirement and some feature dependency
|
||||
checks, thus there is no special reason not to do the check for mount
|
||||
reconfiguration.
|
||||
|
||||
Reported-by: syzbot+56360f93efa90ff15870@syzkaller.appspotmail.com
|
||||
Link: https://lore.kernel.org/linux-btrfs/0000000000008c5d090621cb2770@google.com/
|
||||
Fixes: f044b318675f ("btrfs: handle the ro->rw transition for mounting different subvolumes")
|
||||
CC: stable@vger.kernel.org # 6.8+
|
||||
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Signed-off-by: Qu Wenruo <wqu@suse.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/super.c | 3 +--
|
||||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
--- a/fs/btrfs/super.c
|
||||
+++ b/fs/btrfs/super.c
|
||||
@@ -1519,8 +1519,7 @@ static int btrfs_reconfigure(struct fs_c
|
||||
sync_filesystem(sb);
|
||||
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
|
||||
|
||||
- if (!mount_reconfigure &&
|
||||
- !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
|
||||
+ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
|
||||
return -EINVAL;
|
||||
|
||||
ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
|
54 debian/patches/patchset-pf/fixes/0012-btrfs-fix-passing-0-to-ERR_PTR-in-btrfs_search_dir_i.patch vendored Normal file
@@ -0,0 +1,54 @@
|
||||
From ed73b9279db9536a9672cba6506950c26cedb140 Mon Sep 17 00:00:00 2001
|
||||
From: Yue Haibing <yuehaibing@huawei.com>
|
||||
Date: Tue, 22 Oct 2024 17:52:08 +0800
|
||||
Subject: btrfs: fix passing 0 to ERR_PTR in btrfs_search_dir_index_item()
|
||||
|
||||
The ret may be zero in btrfs_search_dir_index_item() and should not
|
||||
be passed to ERR_PTR(). Now btrfs_unlink_subvol() is the only caller of
|
||||
this; rework it to expect ERR_PTR(-ENOENT) when ret >= 0.
|
||||
|
||||
This fixes smatch warnings:
|
||||
|
||||
fs/btrfs/dir-item.c:353
|
||||
btrfs_search_dir_index_item() warn: passing zero to 'ERR_PTR'
|
||||
|
||||
Fixes: 9dcbe16fccbb ("btrfs: use btrfs_for_each_slot in btrfs_search_dir_index_item")
|
||||
CC: stable@vger.kernel.org # 6.1+
|
||||
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
|
||||
Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
|
||||
Reviewed-by: David Sterba <dsterba@suse.com>
|
||||
Signed-off-by: David Sterba <dsterba@suse.com>
|
||||
---
|
||||
fs/btrfs/dir-item.c | 4 ++--
|
||||
fs/btrfs/inode.c | 7 ++-----
|
||||
2 files changed, 4 insertions(+), 7 deletions(-)
|
||||
|
||||
--- a/fs/btrfs/dir-item.c
|
||||
+++ b/fs/btrfs/dir-item.c
|
||||
@@ -347,8 +347,8 @@ btrfs_search_dir_index_item(struct btrfs
|
||||
return di;
|
||||
}
|
||||
/* Adjust return code if the key was not found in the next leaf. */
|
||||
- if (ret > 0)
|
||||
- ret = 0;
|
||||
+ if (ret >= 0)
|
||||
+ ret = -ENOENT;
|
||||
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
--- a/fs/btrfs/inode.c
|
||||
+++ b/fs/btrfs/inode.c
|
||||
@@ -4344,11 +4344,8 @@ static int btrfs_unlink_subvol(struct bt
|
||||
*/
|
||||
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
|
||||
di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
|
||||
- if (IS_ERR_OR_NULL(di)) {
|
||||
- if (!di)
|
||||
- ret = -ENOENT;
|
||||
- else
|
||||
- ret = PTR_ERR(di);
|
||||
+ if (IS_ERR(di)) {
|
||||
+ ret = PTR_ERR(di);
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
goto out;
|
||||
}
|
407 debian/patches/patchset-pf/ksm/0001-mm-expose-per-process-KSM-control-via-syscalls.patch vendored Normal file
@@ -0,0 +1,407 @@
|
||||
From 88362669534c70bbc7036f45bb23e63a30d4adfb Mon Sep 17 00:00:00 2001
|
||||
From: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Date: Mon, 29 Jul 2024 00:38:24 +0200
|
||||
Subject: mm: expose per-process KSM control via syscalls
|
||||
|
||||
d7597f59d1d3 added a new API to enable per-process KSM control. It
|
||||
however uses prctl, which doesn't allow controlling KSM from outside of
|
||||
the current process.
|
||||
|
||||
Hence, expose this API via 3 syscalls: process_ksm_enable,
|
||||
process_ksm_disable and process_ksm_status. Given sufficient privileges,
|
||||
auto-KSM can be enable by another process.
|
||||
|
||||
Since these syscalls are not in the upstream kernel, also expose their
|
||||
numbers under /sys/kernel/process_ksm so that userspace tooling like
|
||||
uksmd knows how to use them.
|
||||
|
||||
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
---
|
||||
arch/alpha/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/arm/tools/syscall.tbl | 3 +
|
||||
arch/m68k/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/microblaze/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/mips/kernel/syscalls/syscall_n32.tbl | 3 +
|
||||
arch/mips/kernel/syscalls/syscall_n64.tbl | 3 +
|
||||
arch/mips/kernel/syscalls/syscall_o32.tbl | 3 +
|
||||
arch/parisc/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/powerpc/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/s390/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/sh/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/sparc/kernel/syscalls/syscall.tbl | 3 +
|
||||
arch/x86/entry/syscalls/syscall_32.tbl | 3 +
|
||||
arch/x86/entry/syscalls/syscall_64.tbl | 3 +
|
||||
arch/xtensa/kernel/syscalls/syscall.tbl | 3 +
|
||||
include/linux/syscalls.h | 3 +
|
||||
include/uapi/asm-generic/unistd.h | 9 +-
|
||||
kernel/sys.c | 147 ++++++++++++++++++
|
||||
kernel/sys_ni.c | 3 +
|
||||
scripts/syscall.tbl | 3 +
|
||||
.../arch/powerpc/entry/syscalls/syscall.tbl | 3 +
|
||||
.../perf/arch/s390/entry/syscalls/syscall.tbl | 3 +
|
||||
22 files changed, 215 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/arch/alpha/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
|
||||
@@ -502,3 +502,6 @@
|
||||
570 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
571 common lsm_list_modules sys_lsm_list_modules
|
||||
572 common mseal sys_mseal
|
||||
+573 common process_ksm_enable sys_process_ksm_enable
|
||||
+574 common process_ksm_disable sys_process_ksm_disable
|
||||
+575 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/arm/tools/syscall.tbl
|
||||
+++ b/arch/arm/tools/syscall.tbl
|
||||
@@ -477,3 +477,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/m68k/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
|
||||
@@ -462,3 +462,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
|
||||
@@ -468,3 +468,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
|
||||
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
|
||||
@@ -401,3 +401,6 @@
|
||||
460 n32 lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 n32 lsm_list_modules sys_lsm_list_modules
|
||||
462 n32 mseal sys_mseal
|
||||
+463 n32 process_ksm_enable sys_process_ksm_enable
|
||||
+464 n32 process_ksm_disable sys_process_ksm_disable
|
||||
+465 n32 process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
|
||||
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
|
||||
@@ -377,3 +377,6 @@
|
||||
460 n64 lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 n64 lsm_list_modules sys_lsm_list_modules
|
||||
462 n64 mseal sys_mseal
|
||||
+463 n64 process_ksm_enable sys_process_ksm_enable
|
||||
+464 n64 process_ksm_disable sys_process_ksm_disable
|
||||
+465 n64 process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
|
||||
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
|
||||
@@ -450,3 +450,6 @@
|
||||
460 o32 lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 o32 lsm_list_modules sys_lsm_list_modules
|
||||
462 o32 mseal sys_mseal
|
||||
+463 o32 process_ksm_enable sys_process_ksm_enable
|
||||
+464 o32 process_ksm_disable sys_process_ksm_disable
|
||||
+465 o32 process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/parisc/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
|
||||
@@ -461,3 +461,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
|
||||
@@ -553,3 +553,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/s390/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/s390/kernel/syscalls/syscall.tbl
|
||||
@@ -465,3 +465,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/sh/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/sh/kernel/syscalls/syscall.tbl
|
||||
@@ -466,3 +466,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/sparc/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
|
||||
@@ -508,3 +508,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
|
||||
@@ -468,3 +468,6 @@
|
||||
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 i386 lsm_list_modules sys_lsm_list_modules
|
||||
462 i386 mseal sys_mseal
|
||||
+463 i386 process_ksm_enable sys_process_ksm_enable
|
||||
+464 i386 process_ksm_disable sys_process_ksm_disable
|
||||
+465 i386 process_ksm_status sys_process_ksm_status
|
||||
--- a/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
|
||||
@@ -386,6 +386,9 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
|
||||
#
|
||||
# Due to a historical design error, certain syscalls are numbered differently
|
||||
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
|
||||
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
|
||||
@@ -433,3 +433,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/include/linux/syscalls.h
|
||||
+++ b/include/linux/syscalls.h
|
||||
@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned lon
|
||||
asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
|
||||
size_t vlen, int behavior, unsigned int flags);
|
||||
asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
|
||||
+asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags);
|
||||
+asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags);
|
||||
+asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags);
|
||||
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
|
||||
unsigned long prot, unsigned long pgoff,
|
||||
unsigned long flags);
|
||||
--- a/include/uapi/asm-generic/unistd.h
|
||||
+++ b/include/uapi/asm-generic/unistd.h
|
||||
@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm
|
||||
#define __NR_mseal 462
|
||||
__SYSCALL(__NR_mseal, sys_mseal)
|
||||
|
||||
+#define __NR_process_ksm_enable 463
|
||||
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
|
||||
+#define __NR_process_ksm_disable 464
|
||||
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
|
||||
+#define __NR_process_ksm_status 465
|
||||
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
|
||||
+
|
||||
#undef __NR_syscalls
|
||||
-#define __NR_syscalls 463
|
||||
+#define __NR_syscalls 466
|
||||
|
||||
/*
|
||||
* 32 bit systems traditionally used different
|
||||
--- a/kernel/sys.c
|
||||
+++ b/kernel/sys.c
|
||||
@@ -2789,6 +2789,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
|
||||
return error;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_KSM
|
||||
+enum pkc_action {
|
||||
+ PKSM_ENABLE = 0,
|
||||
+ PKSM_DISABLE,
|
||||
+ PKSM_STATUS,
|
||||
+};
|
||||
+
|
||||
+static long do_process_ksm_control(int pidfd, enum pkc_action action)
|
||||
+{
|
||||
+ long ret;
|
||||
+ struct pid *pid;
|
||||
+ struct task_struct *task;
|
||||
+ struct mm_struct *mm;
|
||||
+ unsigned int f_flags;
|
||||
+
|
||||
+ pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
+ if (IS_ERR(pid)) {
|
||||
+ ret = PTR_ERR(pid);
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ task = get_pid_task(pid, PIDTYPE_PID);
|
||||
+ if (!task) {
|
||||
+ ret = -ESRCH;
|
||||
+ goto put_pid;
|
||||
+ }
|
||||
+
|
||||
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
|
||||
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
|
||||
+ if (IS_ERR_OR_NULL(mm)) {
|
||||
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
|
||||
+ goto release_task;
|
||||
+ }
|
||||
+
|
||||
+ /* Require CAP_SYS_NICE for influencing process performance. */
|
||||
+ if (!capable(CAP_SYS_NICE)) {
|
||||
+ ret = -EPERM;
|
||||
+ goto release_mm;
|
||||
+ }
|
||||
+
|
||||
+ if (mmap_write_lock_killable(mm)) {
|
||||
+ ret = -EINTR;
|
||||
+ goto release_mm;
|
||||
+ }
|
||||
+
|
||||
+ switch (action) {
|
||||
+ case PKSM_ENABLE:
|
||||
+ ret = ksm_enable_merge_any(mm);
|
||||
+ break;
|
||||
+ case PKSM_DISABLE:
|
||||
+ ret = ksm_disable_merge_any(mm);
|
||||
+ break;
|
||||
+ case PKSM_STATUS:
|
||||
+ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ mmap_write_unlock(mm);
|
||||
+
|
||||
+release_mm:
|
||||
+ mmput(mm);
|
||||
+release_task:
|
||||
+ put_task_struct(task);
|
||||
+put_pid:
|
||||
+ put_pid(pid);
|
||||
+out:
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif /* CONFIG_KSM */
|
||||
+
|
||||
+SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags)
|
||||
+{
|
||||
+#ifdef CONFIG_KSM
|
||||
+ if (flags != 0)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return do_process_ksm_control(pidfd, PKSM_ENABLE);
|
||||
+#else /* CONFIG_KSM */
|
||||
+ return -ENOSYS;
|
||||
+#endif /* CONFIG_KSM */
|
||||
+}
|
||||
+
|
||||
+SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags)
|
||||
+{
|
||||
+#ifdef CONFIG_KSM
|
||||
+ if (flags != 0)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return do_process_ksm_control(pidfd, PKSM_DISABLE);
|
||||
+#else /* CONFIG_KSM */
|
||||
+ return -ENOSYS;
|
||||
+#endif /* CONFIG_KSM */
|
||||
+}
|
||||
+
|
||||
+SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags)
|
||||
+{
|
||||
+#ifdef CONFIG_KSM
|
||||
+ if (flags != 0)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return do_process_ksm_control(pidfd, PKSM_STATUS);
|
||||
+#else /* CONFIG_KSM */
|
||||
+ return -ENOSYS;
|
||||
+#endif /* CONFIG_KSM */
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_KSM
|
||||
+static ssize_t process_ksm_enable_show(struct kobject *kobj,
|
||||
+ struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return sprintf(buf, "%u\n", __NR_process_ksm_enable);
|
||||
+}
|
||||
+static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable);
|
||||
+
|
||||
+static ssize_t process_ksm_disable_show(struct kobject *kobj,
|
||||
+ struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return sprintf(buf, "%u\n", __NR_process_ksm_disable);
|
||||
+}
|
||||
+static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable);
|
||||
+
|
||||
+static ssize_t process_ksm_status_show(struct kobject *kobj,
|
||||
+ struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ return sprintf(buf, "%u\n", __NR_process_ksm_status);
|
||||
+}
|
||||
+static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status);
|
||||
+
|
||||
+static struct attribute *process_ksm_sysfs_attrs[] = {
|
||||
+ &process_ksm_enable_attr.attr,
|
||||
+ &process_ksm_disable_attr.attr,
|
||||
+ &process_ksm_status_attr.attr,
|
||||
+ NULL,
|
||||
+};
|
||||
+
|
||||
+static const struct attribute_group process_ksm_sysfs_attr_group = {
|
||||
+ .attrs = process_ksm_sysfs_attrs,
|
||||
+ .name = "process_ksm",
|
||||
+};
|
||||
+
|
||||
+static int __init process_ksm_sysfs_init(void)
|
||||
+{
|
||||
+ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group);
|
||||
+}
|
||||
+subsys_initcall(process_ksm_sysfs_init);
|
||||
+#endif /* CONFIG_KSM */
|
||||
+
|
||||
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
|
||||
struct getcpu_cache __user *, unused)
|
||||
{
|
||||
--- a/kernel/sys_ni.c
|
||||
+++ b/kernel/sys_ni.c
|
||||
@@ -186,6 +186,9 @@ COND_SYSCALL(mincore);
|
||||
COND_SYSCALL(madvise);
|
||||
COND_SYSCALL(process_madvise);
|
||||
COND_SYSCALL(process_mrelease);
|
||||
+COND_SYSCALL(process_ksm_enable);
|
||||
+COND_SYSCALL(process_ksm_disable);
|
||||
+COND_SYSCALL(process_ksm_status);
|
||||
COND_SYSCALL(remap_file_pages);
|
||||
COND_SYSCALL(mbind);
|
||||
COND_SYSCALL(get_mempolicy);
|
||||
--- a/scripts/syscall.tbl
|
||||
+++ b/scripts/syscall.tbl
|
||||
@@ -403,3 +403,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
|
||||
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
|
||||
@@ -553,3 +553,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status
|
||||
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
|
||||
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
|
||||
@@ -465,3 +465,6 @@
|
||||
460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
|
||||
461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
|
||||
462 common mseal sys_mseal sys_mseal
|
||||
+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable
|
||||
+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable
|
||||
+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status
|
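The hunks above wire the three new syscalls into the generic and perf syscall tables (numbers 463-465) and export those numbers through /sys/kernel/process_ksm/. Below is a minimal userspace sketch of how a tool might drive them; it is not part of the patch, and it assumes an x86-64 glibc system where SYS_pidfd_open is available and the caller holds CAP_SYS_NICE over the target process.

/*
 * Illustrative only -- not part of this patchset. Enables KSM merging for
 * a target process via the new process_ksm_enable syscall, discovering the
 * syscall number from the sysfs attribute registered by the patch above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long nr;
	int pidfd;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* The patch exposes the syscall number in sysfs for discovery. */
	f = fopen("/sys/kernel/process_ksm/process_ksm_enable", "r");
	if (!f || fscanf(f, "%ld", &nr) != 1) {
		fprintf(stderr, "cannot read syscall number from sysfs\n");
		return 1;
	}
	fclose(f);

	/* A pidfd identifies the target process. */
	pidfd = (int)syscall(SYS_pidfd_open, atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* flags must be 0; the caller needs CAP_SYS_NICE over the target. */
	if (syscall(nr, pidfd, 0) < 0) {
		perror("process_ksm_enable");
		close(pidfd);
		return 1;
	}

	close(pidfd);
	return 0;
}

process_ksm_status uses the same calling convention but, per the test_bit() above, returns 0 or 1 to report whether MMF_VM_MERGE_ANY is set instead of modifying the target.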
debian/patches/patchset-pf/ksm/0002-mm-process_ksm-use-pidfd_get_task-instead-of-pidfd_g.patch (new vendored file, 50 lines)
@@ -0,0 +1,50 @@
|
||||
From 9308d03bfeb941469da17e2903ca06254b110b25 Mon Sep 17 00:00:00 2001
|
||||
From: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Date: Tue, 24 Sep 2024 11:58:41 +0200
|
||||
Subject: mm/process_ksm: use pidfd_get_task() instead of
|
||||
pidfd_get_pid()+get_pid_task()
|
||||
|
||||
Link: https://git.kernel.org/linus/ee9955d61a0a
|
||||
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
---
|
||||
kernel/sys.c | 15 +++------------
|
||||
1 file changed, 3 insertions(+), 12 deletions(-)
|
||||
|
||||
--- a/kernel/sys.c
|
||||
+++ b/kernel/sys.c
|
||||
@@ -2799,23 +2799,16 @@ enum pkc_action {
|
||||
static long do_process_ksm_control(int pidfd, enum pkc_action action)
|
||||
{
|
||||
long ret;
|
||||
- struct pid *pid;
|
||||
struct task_struct *task;
|
||||
struct mm_struct *mm;
|
||||
unsigned int f_flags;
|
||||
|
||||
- pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
- if (IS_ERR(pid)) {
|
||||
- ret = PTR_ERR(pid);
|
||||
+ task = pidfd_get_task(pidfd, &f_flags);
|
||||
+ if (IS_ERR(task)) {
|
||||
+ ret = PTR_ERR(task);
|
||||
goto out;
|
||||
}
|
||||
|
||||
- task = get_pid_task(pid, PIDTYPE_PID);
|
||||
- if (!task) {
|
||||
- ret = -ESRCH;
|
||||
- goto put_pid;
|
||||
- }
|
||||
-
|
||||
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
|
||||
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
|
||||
if (IS_ERR_OR_NULL(mm)) {
|
||||
@@ -2852,8 +2845,6 @@ release_mm:
|
||||
mmput(mm);
|
||||
release_task:
|
||||
put_task_struct(task);
|
||||
-put_pid:
|
||||
- put_pid(pid);
|
||||
out:
|
||||
return ret;
|
||||
}
|
debian/patches/patchset-pf/zstd/0001-zstd-import-upstream-v1.5.6.patch (new vendored file, 18533 lines)
File diff suppressed because it is too large.
debian/patches/patchset-pf/zstd/0002-lib-zstd-Refactor-intentional-wrap-around-test.patch (new vendored file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
From c09f361b41027ca073de5631c66dfe0e7275c3a4 Mon Sep 17 00:00:00 2001
|
||||
From: Kees Cook <keescook@chromium.org>
|
||||
Date: Mon, 22 Jan 2024 16:27:56 -0800
|
||||
Subject: lib: zstd: Refactor intentional wrap-around test
|
||||
|
||||
In an effort to separate intentional arithmetic wrap-around from
|
||||
unexpected wrap-around, we need to refactor places that depend on this
|
||||
kind of math. One of the most common code patterns of this is:
|
||||
|
||||
VAR + value < VAR
|
||||
|
||||
Notably, this is considered "undefined behavior" for signed and pointer
|
||||
types, which the kernel works around by using the -fno-strict-overflow
|
||||
option in the build[1] (which used to just be -fwrapv). Regardless, we
|
||||
want to get the kernel source to the position where we can meaningfully
|
||||
instrument arithmetic wrap-around conditions and catch them when they
|
||||
are unexpected, regardless of whether they are signed[2], unsigned[3],
|
||||
or pointer[4] types.
|
||||
|
||||
Switch to a more regular type for a 64-bit value and refactor the
|
||||
open-coded wrap-around addition test to use subtraction from the type max
|
||||
(since add_would_overflow() may not be defined in early boot code). This
|
||||
paves the way to enabling the wrap-around sanitizers in the future.
|
||||
|
||||
Link: https://git.kernel.org/linus/68df3755e383e6fecf2354a67b08f92f18536594 [1]
|
||||
Link: https://github.com/KSPP/linux/issues/26 [2]
|
||||
Link: https://github.com/KSPP/linux/issues/27 [3]
|
||||
Link: https://github.com/KSPP/linux/issues/344 [4]
|
||||
Cc: Nick Terrell <terrelln@fb.com>
|
||||
Cc: Paul Jones <paul@pauljones.id.au>
|
||||
Cc: Sedat Dilek <sedat.dilek@gmail.com>
|
||||
Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Cc: Xin Gao <gaoxin@cdjrlc.com>
|
||||
Signed-off-by: Kees Cook <keescook@chromium.org>
|
||||
---
|
||||
lib/zstd/decompress/zstd_decompress.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
--- a/lib/zstd/decompress/zstd_decompress.c
|
||||
+++ b/lib/zstd/decompress/zstd_decompress.c
|
||||
@@ -618,7 +618,7 @@ size_t ZSTD_readSkippableFrame(void* dst
|
||||
* @return : decompressed size of the frames contained */
|
||||
unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
|
||||
{
|
||||
- unsigned long long totalDstSize = 0;
|
||||
+ U64 totalDstSize = 0;
|
||||
|
||||
while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
|
||||
U32 const magicNumber = MEM_readLE32(src);
|
||||
@@ -636,7 +636,7 @@ unsigned long long ZSTD_findDecompressed
|
||||
{ unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
|
||||
if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;
|
||||
|
||||
- if (totalDstSize + fcs < totalDstSize)
|
||||
+ if (U64_MAX - totalDstSize < fcs)
|
||||
return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
|
||||
totalDstSize += fcs;
|
||||
}
|
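The change above replaces the post-hoc test "totalDstSize + fcs < totalDstSize", which relies on intentional unsigned wrap-around, with a headroom check against the type maximum before the addition. A small standalone sketch of the same pattern follows; it is illustrative only and not taken from the kernel or zstd sources.

/* Standalone illustration of the overflow-check refactor (not from the patch). */
#include <stdint.h>
#include <stdio.h>

static int add_u64_checked(uint64_t a, uint64_t b, uint64_t *sum)
{
	/* Same idea as "U64_MAX - totalDstSize < fcs": test headroom first. */
	if (UINT64_MAX - a < b)
		return -1;	/* addition would wrap around */
	*sum = a + b;
	return 0;
}

int main(void)
{
	uint64_t total = 0;
	const uint64_t sizes[] = { 1ull << 40, UINT64_MAX };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		if (add_u64_checked(total, sizes[i], &total)) {
			puts("overflow detected, refusing to accumulate");
			return 1;
		}
		printf("running total: %llu\n", (unsigned long long)total);
	}
	return 0;
}

Checking the headroom before adding keeps the arithmetic free of wrap-around, so enabling an unsigned-overflow sanitizer later will not flag this path.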