From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:37 -0700
Subject: perf: Generic hotplug support for a PMU with a scope

The perf subsystem assumes that the counters of a PMU are per-CPU, so
the user-space tool reads a counter from each CPU in system-wide mode.
However, many PMUs don't have per-CPU counters; a counter is effective
for a scope, e.g., a die or a socket. To address this, the kernel
driver exposes a cpumask that restricts system-wide reads to one CPU
standing for the scope. In case that CPU is removed, hotplug support
has to be implemented by each such driver.

The code to support the cpumask and hotplug is very similar in each
driver (a sketch of the duplicated pattern follows the list):
- Expose a cpumask in sysfs.
- Pick another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set the CPU from the cpumask as event->cpu.

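For illustration, the duplicated per-driver hotplug callback looks
roughly like the sketch below; the my_pmu names are hypothetical and
die is just an example scope:

static struct cpumask my_pmu_cpumask;	/* the driver's exposed "cpumask" */

static int my_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
	int target;

	/* This CPU was not the active reader for the PMU; nothing to do. */
	if (!cpumask_test_and_clear_cpu(cpu, &my_pmu_cpumask))
		return 0;

	/* Pick another CPU in the same scope, if one is still online. */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &my_pmu_cpumask);
		perf_pmu_migrate_context(&my_pmu, cpu, target);
	}
	return 0;
}
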
Similar duplicated code is implemented in each such PMU driver. It
would be good to introduce a generic infrastructure to avoid the
duplication.

Five popular scopes are implemented here: core, die, cluster, pkg, and
system-wide. The scope can be set when a PMU is registered; if it is, a
"cpumask" is automatically exposed for the PMU.

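With the generic infrastructure, a driver only needs to pick a scope
at registration time; a minimal sketch, assuming a hypothetical
my_uncore driver with one counter set per package:

static struct pmu my_uncore_pmu = {
	.task_ctx_nr	= perf_invalid_context,
	.scope		= PERF_PMU_SCOPE_PKG,
	.event_init	= my_uncore_event_init,
	.add		= my_uncore_add,
	.del		= my_uncore_del,
	.start		= my_uncore_start,
	.stop		= my_uncore_stop,
	.read		= my_uncore_read,
};

/* "cpumask" shows up in sysfs; hotplug migration is handled generically. */
ret = perf_pmu_register(&my_uncore_pmu, "my_uncore", -1);
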
The "cpumask" is from the perf_online_<scope>_mask, which is to track
|
||
|
the active CPU for each scope. They are set when the first CPU of the
|
||
|
scope is online via the generic perf hotplug support. When a
|
||
|
corresponding CPU is removed, the perf_online_<scope>_mask is updated
|
||
|
accordingly and the PMU will be moved to a new CPU from the same scope
|
||
|
if possible.
|
||
|
|
||
|
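From user space the choice of the active CPU is transparent; a minimal
sketch of opening a counting event on a scoped PMU (assuming the
dynamic type has been read from the PMU's sysfs "type" file):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_scoped_event(__u32 type, __u64 config, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = type;	/* dynamic PMU type from sysfs */
	attr.size = sizeof(attr);
	attr.config = config;

	/*
	 * pid == -1 && cpu >= 0: count on that CPU; event init remaps
	 * event->cpu to the active CPU of the same scope.
	 */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}
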
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 include/linux/perf_event.h |  18 ++++
 kernel/events/core.c       | 164 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 180 insertions(+), 2 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
 
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+	PERF_PMU_SCOPE_NONE	= 0,
+	PERF_PMU_SCOPE_CORE,
+	PERF_PMU_SCOPE_DIE,
+	PERF_PMU_SCOPE_CLUSTER,
+	PERF_PMU_SCOPE_PKG,
+	PERF_PMU_SCOPE_SYS_WIDE,
+	PERF_PMU_MAX_SCOPE,
+};
+
 struct perf_output_handle;
 
 #define PMU_NULL_DEV	((void *)(~0UL))
@@ -315,6 +328,11 @@ struct pmu {
 	 */
 	int				capabilities;
 
+	/*
+	 * PMU scope
+	 */
+	unsigned int			scope;
+
 	int __percpu			*pmu_disable_count;
 	struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
 	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -411,6 +411,11 @@ static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
 static cpumask_var_t	   perf_online_mask;
+static cpumask_var_t	   perf_online_core_mask;
+static cpumask_var_t	   perf_online_die_mask;
+static cpumask_var_t	   perf_online_cluster_mask;
+static cpumask_var_t	   perf_online_pkg_mask;
+static cpumask_var_t	   perf_online_sys_mask;
 static struct kmem_cache *perf_event_cache;
 
 /*
@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+	switch (scope) {
+	case PERF_PMU_SCOPE_CORE:
+		return topology_sibling_cpumask(cpu);
+	case PERF_PMU_SCOPE_DIE:
+		return topology_die_cpumask(cpu);
+	case PERF_PMU_SCOPE_CLUSTER:
+		return topology_cluster_cpumask(cpu);
+	case PERF_PMU_SCOPE_PKG:
+		return topology_core_cpumask(cpu);
+	case PERF_PMU_SCOPE_SYS_WIDE:
+		return cpu_online_mask;
+	}
+
+	return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+	switch (scope) {
+	case PERF_PMU_SCOPE_CORE:
+		return perf_online_core_mask;
+	case PERF_PMU_SCOPE_DIE:
+		return perf_online_die_mask;
+	case PERF_PMU_SCOPE_CLUSTER:
+		return perf_online_cluster_mask;
+	case PERF_PMU_SCOPE_PKG:
+		return perf_online_pkg_mask;
+	case PERF_PMU_SCOPE_SYS_WIDE:
+		return perf_online_sys_mask;
+	}
+
+	return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+	if (mask)
+		return cpumap_print_to_pagebuf(true, buf, mask);
+	return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
 static struct attribute *pmu_dev_attrs[] = {
 	&dev_attr_type.attr,
 	&dev_attr_perf_event_mux_interval_ms.attr,
 	&dev_attr_nr_addr_filters.attr,
+	&dev_attr_cpumask.attr,
 	NULL,
 };
 
@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *attr,
 	if (n == 2 && !pmu->nr_addr_filters)
 		return 0;
 
+	/* cpumask */
+	if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+		return 0;
+
 	return a->mode;
 }
 
@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		goto free_pdc;
 	}
 
+	if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Cannot register a PMU with an invalid scope.\n")) {
+		ret = -EINVAL;
+		goto free_pdc;
+	}
+
 	pmu->name = name;
 
 	if (type >= 0)
@@ -11750,6 +11814,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 		    event_has_any_exclude_flag(event))
 			ret = -EINVAL;
 
+		if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+			const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+			struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+			int cpu;
+
+			if (pmu_cpumask && cpumask) {
+				cpu = cpumask_any_and(pmu_cpumask, cpumask);
+				if (cpu >= nr_cpu_ids)
+					ret = -ENODEV;
+				else
+					event->cpu = cpu;
+			} else {
+				ret = -ENODEV;
+			}
+		}
+
 		if (ret && event->destroy)
 			event->destroy(event);
 	}
@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_cpus(void)
 	int cpu;
 
 	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
 
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(void *__info)
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+	int target[PERF_PMU_MAX_SCOPE];
+	unsigned int scope;
+	struct pmu *pmu;
+
+	cpumask_clear_cpu(cpu, perf_online_mask);
+
+	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+		struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+		target[scope] = -1;
+		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+			continue;
+
+		if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+			continue;
+		target[scope] = cpumask_any_but(cpumask, cpu);
+		if (target[scope] < nr_cpu_ids)
+			cpumask_set_cpu(target[scope], pmu_cpumask);
+	}
+
+	/* migrate */
+	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+		if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+		    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+			continue;
+
+		if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+			perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+	}
+}
+
 static void perf_event_exit_cpu_context(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context(int cpu)
 
 	// XXX simplify cpuctx->online
 	mutex_lock(&pmus_lock);
+	/*
+	 * Clear the cpumasks, and migrate to other CPUs if possible.
+	 * Must be invoked before __perf_event_exit_context().
+	 */
+	perf_event_clear_cpumask(cpu);
 	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
 	ctx = &cpuctx->ctx;
 
@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context(int cpu)
 	smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
 	cpuctx->online = 0;
 	mutex_unlock(&ctx->mutex);
-	cpumask_clear_cpu(cpu, perf_online_mask);
 	mutex_unlock(&pmus_lock);
 }
 #else
@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context(int cpu)
 
 #endif
 
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+	struct cpumask *pmu_cpumask;
+	unsigned int scope;
+
+	cpumask_set_cpu(cpu, perf_online_mask);
+
+	/*
+	 * Early boot stage, the cpumask hasn't been set yet.
+	 * The perf_online_<domain>_masks include the first CPU of each domain.
+	 * Unconditionally set the boot CPU in the perf_online_<domain>_masks.
+	 */
+	if (!topology_sibling_cpumask(cpu)) {
+		for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+			pmu_cpumask = perf_scope_cpumask(scope);
+			if (WARN_ON_ONCE(!pmu_cpumask))
+				continue;
+			cpumask_set_cpu(cpu, pmu_cpumask);
+		}
+		return;
+	}
+
+	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+		pmu_cpumask = perf_scope_cpumask(scope);
+
+		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+			continue;
+
+		if (!cpumask_empty(cpumask) &&
+		    cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+			cpumask_set_cpu(cpu, pmu_cpumask);
+	}
+}
+
 int perf_event_init_cpu(unsigned int cpu)
 {
 	struct perf_cpu_context *cpuctx;
@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu)
 	perf_swevent_init_cpu(cpu);
 
 	mutex_lock(&pmus_lock);
-	cpumask_set_cpu(cpu, perf_online_mask);
+	perf_event_setup_cpumask(cpu);
 	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
 	ctx = &cpuctx->ctx;
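
For reference (not part of the diff above): user space can discover the
active CPU(s) through the generically created sysfs attribute; a minimal
sketch, assuming a registered scoped PMU named "my_uncore":

#include <stdio.h>

int main(void)
{
	char mask[256];
	FILE *f = fopen("/sys/bus/event_source/devices/my_uncore/cpumask", "r");

	if (!f)
		return 1;
	if (fgets(mask, sizeof(mask), f))
		printf("active CPUs: %s", mask);
	fclose(f);
	return 0;
}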