
add 3rd party/custom patches

3rd-party patches (in alphabetical order):
- bbr3
- ntsync5
- openwrt
- pf-kernel
- xanmod
- zen

no configuration changes for now
Konstantin Demin 2024-10-29 05:12:06 +03:00
parent 8082dfeaca
commit 8cbaf1dea2
186 changed files with 43626 additions and 0 deletions

debian/bin/genpatch-pfkernel vendored Executable file
View File

@@ -0,0 +1,61 @@
#!/bin/sh
set -ef
export GIT_OPTIONAL_LOCKS=0
w=$(git rev-parse --path-format=absolute --show-toplevel) ; : "${w:?}" ; cd "$w"
dst='debian/patches/pf'
src='../linux-extras'
branches='amd-pstate amd-rapl cpuidle crypto fixes ksm zstd'
[ -d "${dst}" ]
kver=
if [ -n "$1" ] ; then
kver="$1"
else
kver=$(dpkg-parsechangelog --show-field=Version | sed -E 's/^[0-9]+://;s/-[^-]*$//' | cut -d. -f1-2)
fi
from="upstream/linux-${kver}.y"
t=$(mktemp -d) ; : "${t:?}"
cp -ar "${src}" "$t/"
cd "$t/${src##*/}"
git config advice.skippedCherryPicks false
for b in ${branches} ; do
ref="pf/$b-${kver}"
r="tmp-rebase-$b"
git switch --detach "${ref}"
git switch -C "$r"
if git rebase "${from}" ; then
[ -d "$w/${dst}/$b/" ] || mkdir -p "$w/${dst}/$b"
set +e
env -C "$w" git ls-files -z | grep -zF "${dst}/$b/" | grep -zFv '/.' | env -C "$w" -u GIT_OPTIONAL_LOCKS xargs -r -0 git rm -f
find "$w/${dst}/$b/" -name '*.patch' -type f -exec rm -f {} +
set -e
git format-patch -N --subject-prefix='' --output-directory "$w/${dst}/$b" "${from}..$r"
else
echo >&2
git rebase --abort
echo >&2
fi
git switch -q --detach "${ref}"
git branch -D "$r"
echo >&2
done
cd "$w" ; rm -rf "$t"
echo >&2
echo 'put in debian/patches/series' >&2
echo >&2
find "${dst}/" -type f -name '*.patch' | sed -E 's#^debian/patches/##' | sort -V

View File

@@ -0,0 +1,52 @@
This reverts the following commit:
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 14 Jan 2021 16:32:42 -0600
Subject: objtool: Don't fail the kernel build on fatal errors
[ Upstream commit 655cf86548a3938538642a6df27dd359e13c86bd ]
This is basically a revert of commit 644592d32837 ("objtool: Fail the
kernel build on fatal errors").
That change turned out to be more trouble than it's worth. Failing the
build is an extreme measure which sometimes gets too much attention and
blocks CI build testing.
These fatal-type warnings aren't yet as rare as we'd hope, due to the
ever-increasing matrix of supported toolchains/plugins and their
fast-changing nature as of late.
Also, there are more people (and bots) looking for objtool warnings than
ever before, so even non-fatal warnings aren't likely to be ignored for
long.
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4872,10 +4872,14 @@ int check(struct objtool_file *file)
}
out:
- /*
- * For now, don't fail the kernel build on fatal warnings. These
- * errors are still fairly common due to the growing matrix of
- * supported toolchains and their recent pace of change.
- */
+ if (ret < 0) {
+ /*
+ * Fatal error. The binary is corrupt or otherwise broken in
+ * some way, or objtool itself is broken. Fail the kernel
+ * build.
+ */
+ return ret;
+ }
+
return 0;
}

View File

@@ -0,0 +1,11 @@
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -61,7 +61,7 @@ enum nf_ct_tcp_action {
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
[TCP_CONNTRACK_SYN_SENT] = 2 MINS,
[TCP_CONNTRACK_SYN_RECV] = 60 SECS,
- [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [TCP_CONNTRACK_ESTABLISHED] = 128 MINS,
[TCP_CONNTRACK_FIN_WAIT] = 2 MINS,
[TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS,
[TCP_CONNTRACK_LAST_ACK] = 30 SECS,
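
This patch only lowers the built-in default (128 MINS = 7680 s instead of 5 DAYS = 432000 s); the effective value stays tunable at runtime through the usual conntrack sysctl, e.g.:

# show the current established-flow timeout (7680 with this patch applied)
sysctl net.netfilter.nf_conntrack_tcp_timeout_established
# restore the upstream default on a running system if the short timeout is unwanted
sysctl -w net.netfilter.nf_conntrack_tcp_timeout_established=432000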

View File

@@ -0,0 +1,11 @@
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1802,7 +1802,7 @@ static __net_init int inet_init_net(stru
/*
* Set defaults for local port range
*/
- net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;
+ net->ipv4.ip_local_ports.range = 65533u << 16 | 49152u;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
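
Likewise this only changes the per-netns default ephemeral port range from 32768-60999 to 49152-65533; it can still be overridden with the standard sysctl:

# inspect the compiled-in default ("49152 65533" with this patch)
sysctl net.ipv4.ip_local_port_range
# switch back to the upstream default range if needed
sysctl -w net.ipv4.ip_local_port_range="32768 60999"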

View File

@@ -0,0 +1,37 @@
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -374,7 +374,11 @@ static rx_handler_result_t br_handle_fra
return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
- goto drop;
+ fwd_mask |= p->br->group_fwd_mask;
+ if (fwd_mask & (1u << dest[5]))
+ goto forward;
+ else
+ goto drop;
case 0x0E: /* 802.1AB LLDP */
fwd_mask |= p->br->group_fwd_mask;
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1365,8 +1365,6 @@ static int br_changelink(struct net_devi
if (data[IFLA_BR_GROUP_FWD_MASK]) {
u16 fwd_mask = nla_get_u16(data[IFLA_BR_GROUP_FWD_MASK]);
- if (fwd_mask & BR_GROUPFWD_RESTRICTED)
- return -EINVAL;
br->group_fwd_mask = fwd_mask;
}
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -179,8 +179,6 @@ static ssize_t group_fwd_mask_show(struc
static int set_group_fwd_mask(struct net_bridge *br, unsigned long val,
struct netlink_ext_ack *extack)
{
- if (val & BR_GROUPFWD_RESTRICTED)
- return -EINVAL;
br->group_fwd_mask = val;
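
With the BR_GROUPFWD_RESTRICTED check gone, bits that were previously rejected (notably 0x2 for the 01-80-C2-00-00-01 MAC Pause address handled in br_input.c above) can now be set in group_fwd_mask. A sketch, with br0 as a placeholder bridge name:

# forward MAC Pause (0x2) in addition to LLDP (0x4000) group addresses
ip link set dev br0 type bridge group_fwd_mask 0x4002
# the same mask is also exposed at /sys/class/net/br0/bridge/group_fwd_mask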

View File

@@ -0,0 +1,52 @@
From ce1cd7869a208112a8728d1fe9e373f78a2e4a6e Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Tue, 11 Jun 2019 12:26:55 -0400
Subject: [PATCH 01/19] net-tcp_bbr: broaden app-limited rate sample detection
This commit is a bug fix for the Linux TCP app-limited
(application-limited) logic that is used for collecting rate
(bandwidth) samples.
Previously the app-limited logic only looked for "bubbles" of
silence in between application writes, by checking at the start
of each sendmsg. But "bubbles" of silence can also happen before
retransmits: e.g. bubbles can happen between an application write
and a retransmit, or between two retransmits.
Retransmits are triggered by ACKs or timers. So this commit checks
for bubbles of app-limited silence upon ACKs or timers.
Why does this commit check for app-limited state at the start of
ACKs and timer handling? Because at that point we know whether
inflight was fully using the cwnd. During processing the ACK or
timer event we often change the cwnd; after changing the cwnd we
can't know whether inflight was fully using the old cwnd.
Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc
Change-Id: I37221506f5166877c2b110753d39bb0757985e68
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_timer.c | 1 +
2 files changed, 2 insertions(+)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3961,6 +3961,7 @@ static int tcp_ack(struct sock *sk, cons
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
+ tcp_rate_check_app_limited(sk);
/* ts_recent update must be made after we are sure that the packet
* is in window.
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock
return;
}
+ tcp_rate_check_app_limited(sk);
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;

View File

@@ -0,0 +1,74 @@
From b32715fbe2ab96d1060ec37bb9c03feedf366494 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 24 Jun 2018 21:55:59 -0400
Subject: [PATCH 02/19] net-tcp_bbr: v2: shrink delivered_mstamp,
first_tx_mstamp to u32 to free up 8 bytes
Free up some space for tracking inflight and losses for each
bw sample, in upcoming commits.
These timestamps are in microseconds, and are now stored in 32
bits. So they can only hold time intervals up to roughly 2^12 = 4096
seconds. But Linux TCP RTT and RTO tracking has the same 32-bit
microsecond implementation approach and resulting deployment
limitations. So this is not introducing a new limit. And these should
not be a limitation for the foreseeable future.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55
Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 9 +++++++--
net/ipv4/tcp_rate.c | 7 ++++---
2 files changed, 11 insertions(+), 5 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -884,6 +884,11 @@ static inline u32 tcp_stamp_us_delta(u64
return max_t(s64, t1 - t0, 0);
}
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+ return max_t(s32, t1 - t0, 0);
+}
+
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
@@ -973,9 +978,9 @@ struct tcp_skb_cb {
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
- u64 first_tx_mstamp;
+ u32 first_tx_mstamp;
/* when we reached the "delivered" count */
- u64 delivered_mstamp;
+ u32 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -101,8 +101,9 @@ void tcp_rate_skb_delivered(struct sock
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp32_us_delta(
+ tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
}
/* Mark off the skb delivered once it's sacked to avoid being
@@ -155,7 +156,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);

View File

@@ -0,0 +1,109 @@
From 25856231832186fe13189b986cc0e91860c18201 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 5 Aug 2017 11:49:50 -0400
Subject: [PATCH 03/19] net-tcp_bbr: v2: snapshot packets in flight at transmit
time and pass in rate_sample
CC algorithms may want to snapshot the number of packets in flight at
transmit time and pass in rate_sample, to understand the relationship
between inflight and losses or ECN signals, to try to find the highest
inflight value that has acceptable levels of loss/ECN marking.
We split out the code to set an skb's tx.in_flight field into its own
function, so that this code can be used for the TCP_REPAIR "fake send"
code path that inserts skbs into the rtx queue without sending them.
Effort: net-tcp_bbr
Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea
Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b
Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63
Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 6 ++++++
net/ipv4/tcp_output.c | 1 +
net/ipv4/tcp_rate.c | 20 ++++++++++++++++++++
3 files changed, 27 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -981,6 +981,10 @@ struct tcp_skb_cb {
u32 first_tx_mstamp;
/* when we reached the "delivered" count */
u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+ u32 in_flight:20, /* packets in flight at transmit */
+ unused2:12;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -1136,6 +1140,7 @@ struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+ u32 tx_in_flight; /* packets in flight at starting timestamp */
s32 delivered; /* number of packets delivered over interval */
s32 delivered_ce; /* number of packets delivered w/ CE marks*/
long interval_us; /* time for tp->delivered to incr "delivered" */
@@ -1258,6 +1263,7 @@ static inline void tcp_ca_event(struct s
void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
/* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs);
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2765,6 +2765,7 @@ static bool tcp_write_xmit(struct sock *
skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
+ tcp_set_tx_in_flight(sk, skb);
goto repair; /* Skip network transmission */
}
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
* ready to send in the write queue.
*/
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 in_flight;
+
+ /* Check, sanitize, and record packets in flight after skb was sent. */
+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+ "insane in_flight %u cc %s mss %u "
+ "cwnd %u pif %u %u %u %u\n",
+ in_flight, inet_csk(sk)->icsk_ca_ops->name,
+ tp->mss_cache, tp->snd_cwnd,
+ tp->packets_out, tp->retrans_out,
+ tp->sacked_out, tp->lost_out))
+ in_flight = TCPCB_IN_FLIGHT_MAX;
+ TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
/* Snapshot the current delivery information in the skb, to generate
* a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
*/
@@ -67,6 +85,7 @@ void tcp_rate_skb_sent(struct sock *sk,
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+ tcp_set_tx_in_flight(sk, skb);
}
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -96,6 +115,7 @@ void tcp_rate_skb_delivered(struct sock
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->tx_in_flight = scb->tx.in_flight;
rs->last_end_seq = scb->end_seq;
/* Record send time of most recently ACKed packet: */

View File

@@ -0,0 +1,70 @@
From b1772710e8b5b98c09e96d4f1af620cd938fddf7 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Thu, 12 Oct 2017 23:44:27 -0400
Subject: [PATCH 04/19] net-tcp_bbr: v2: count packets lost over TCP rate
sampling interval
For understanding the relationship between inflight and packet loss
signals, to try to find the highest inflight value that has acceptable
levels of packet losses.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab
Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 5 ++++-
net/ipv4/tcp_rate.c | 3 +++
2 files changed, 7 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -985,6 +985,7 @@ struct tcp_skb_cb {
#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
u32 in_flight:20, /* packets in flight at transmit */
unused2:12;
+ u32 lost; /* packets lost so far upon tx of skb */
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
@@ -1138,11 +1139,13 @@ struct ack_sample {
*/
struct rate_sample {
u64 prior_mstamp; /* starting timestamp for interval */
+ u32 prior_lost; /* tp->lost at "prior_mstamp" */
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
u32 tx_in_flight; /* packets in flight at starting timestamp */
+ s32 lost; /* number of packets lost over interval */
s32 delivered; /* number of packets delivered over interval */
- s32 delivered_ce; /* number of packets delivered w/ CE marks*/
+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
u32 snd_interval_us; /* snd interval for delivered packets */
u32 rcv_interval_us; /* rcv interval for delivered packets */
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -84,6 +84,7 @@ void tcp_rate_skb_sent(struct sock *sk,
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.lost = tp->lost;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
tcp_set_tx_in_flight(sk, skb);
}
@@ -110,6 +111,7 @@ void tcp_rate_skb_delivered(struct sock
if (!rs->prior_delivered ||
tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
scb->end_seq, rs->last_end_seq)) {
+ rs->prior_lost = scb->tx.lost;
rs->prior_delivered_ce = scb->tx.delivered_ce;
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
@@ -165,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 d
return;
}
rs->delivered = tp->delivered - rs->prior_delivered;
+ rs->lost = tp->lost - rs->prior_lost;
rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
/* delivered_ce occupies less than 32 bits in the skb control block */

View File

@@ -0,0 +1,38 @@
From fdf01142aea8645186e080f1278d3b5a5fd8c66c Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 19 Nov 2018 13:48:36 -0500
Subject: [PATCH 05/19] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece
For understanding the relationship between inflight and ECN signals,
to try to find the highest inflight value that has acceptable levels
of ECN marking.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8
Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 1 +
2 files changed, 2 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1157,6 +1157,7 @@ struct rate_sample {
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
+ bool is_ece; /* did this ACK have ECN marked? */
};
struct tcp_congestion_ops {
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4060,6 +4060,7 @@ static int tcp_ack(struct sock *sk, cons
delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+ rs.is_ece = !!(flag & FLAG_ECE);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);

View File

@@ -0,0 +1,57 @@
From a3e88432c2ebf12de9c2053a13417ddf2ad4cb4e Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Tue, 7 Aug 2018 21:52:06 -0400
Subject: [PATCH 06/19] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC
module callback API
For connections experiencing reordering, RACK can mark packets lost
long after we receive the SACKs/ACKs hinting that the packets were
actually lost.
This means that CC modules cannot easily learn the volume of inflight
data at which packet loss happens by looking at the current inflight
or even the packets in flight when the most recently SACKed packet was
sent. To learn this, CC modules need to know how many packets were in
flight at the time lost packets were sent. This new callback, combined
with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this.
This also provides a consistent callback that is invoked whether
packets are marked lost upon ACK processing, using the RACK reordering
timer, or at RTO time.
Effort: net-tcp_bbr
Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4
Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 3 +++
net/ipv4/tcp_input.c | 5 +++++
2 files changed, 8 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1184,6 +1184,9 @@ struct tcp_congestion_ops {
/* override sysctl_tcp_min_tso_segs */
u32 (*min_tso_segs)(struct sock *sk);
+ /* react to a specific lost skb (optional) */
+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
+
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(s
*/
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
+ struct sock *sk = (struct sock *)tp;
+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
tp->lost += tcp_skb_pcount(skb);
+ if (ca_ops->skb_marked_lost)
+ ca_ops->skb_marked_lost(sk, skb);
}
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)

View File

@@ -0,0 +1,59 @@
From af7d33e71649b8e2ae00dccf336720a8ab891606 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 1 May 2019 20:16:33 -0400
Subject: [PATCH 07/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in
tcp_shifted_skb()
When tcp_shifted_skb() updates state as adjacent SACKed skbs are
coalesced, previously the tx.in_flight was not adjusted, so we could
get contradictory state where the skb's recorded pcount was bigger
than the tx.in_flight (the number of segments that were in_flight
after sending the skb).
Normally, having a SACKed skb with contradictory pcount/tx.in_flight
would not matter. However, with SACK reneging, the SACKed bit is
removed, and an skb once again becomes eligible for retransmitting,
fragmenting, SACKing, etc. Packetdrill testing verified the following
sequence is possible in a kernel that does not have this commit:
- skb N is SACKed
- skb N+1 is SACKed and combined with skb N using tcp_shifted_skb()
- tcp_shifted_skb() will increase the pcount of prev,
but leave tx.in_flight as-is
- so prev skb can have pcount > tx.in_flight
- RTO, tcp_timeout_mark_lost(), detect reneg,
remove "SACKed" bit, mark skb N as lost
- find pcount of skb N is greater than its tx.in_flight
I suspect this issue is what caused the bbr2_inflight_hi_from_lost_skb():
WARN_ON_ONCE(inflight_prev < 0)
to fire in production machines using bbr2.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715
Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/tcp_input.c | 11 +++++++++++
1 file changed, 11 insertions(+)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1506,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock
WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+ "prev in_flight: %u skb in_flight: %u pcount: %u",
+ TCP_SKB_CB(prev)->tx.in_flight,
+ TCP_SKB_CB(skb)->tx.in_flight,
+ pcount))
+ TCP_SKB_CB(skb)->tx.in_flight = 0;
+ else
+ TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+ TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
/* When we're adding to gso_segs == 1, gso_size will be zero,
* in theory this shouldn't be necessary but as long as DSACK
* code can come after this skb later on it's better to keep

View File

@@ -0,0 +1,97 @@
From a4d44bce49f61f8755f558dc40edff5f8958b7c6 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 1 May 2019 20:16:25 -0400
Subject: [PATCH 08/19] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in
tcp_fragment()
When we fragment an skb that has already been sent, we need to update
the tx.in_flight for the first skb in the resulting pair ("buff").
Because we were not updating the tx.in_flight, the tx.in_flight value
was inconsistent with the pcount of the "buff" skb (tx.in_flight would
be too high). That meant that if the "buff" skb was lost, then
bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value
that is too high. This could result in longer queues and higher packet
loss.
Packetdrill testing verified that without this commit, when the second
half of an skb is SACKed and then later the first half of that skb is
marked lost, the calculated inflight_hi was incorrect.
Effort: net-tcp_bbr
Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874
Origin-9xx-SHA1: a0eb099690af net-tcp_bbr: v2: fix tcp_fragment() tx.in_flight recomputation [prod feb 8 2021; use as a fixup]
Origin-9xx-SHA1: 885503228153ff0c9114e net-tcp_bbr: v2: introduce tcp_skb_tx_in_flight_is_suspicious() helper for warnings
Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 15 +++++++++++++++
net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++-
2 files changed, 40 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1283,6 +1283,21 @@ static inline bool tcp_skb_sent_after(u6
return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
+/* If a retransmit failed due to local qdisc congestion or other local issues,
+ * then we may have called tcp_set_skb_tso_segs() to increase the number of
+ * segments in the skb without increasing the tx.in_flight. In all other cases,
+ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We
+ * do not have the state to know whether a retransmit failed due to local qdisc
+ * congestion or other local issues, so to avoid spurious warnings we consider
+ * that any skb marked lost may have suffered that fate.
+ */
+static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount,
+ u32 skb_sacked_flags,
+ u32 tx_in_flight)
+{
+ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST);
+}
+
/* These functions determine how the current flow behaves in respect of SACK
* handling. SACK is negotiated with the peer, and therefore it can vary
* between different flows.
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1601,7 +1601,7 @@ int tcp_fragment(struct sock *sk, enum t
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int old_factor;
+ int old_factor, inflight_prev;
long limit;
int nlen;
u8 flags;
@@ -1676,6 +1676,30 @@ int tcp_fragment(struct sock *sk, enum t
if (diff)
tcp_adjust_pcount(sk, skb, diff);
+
+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+ if (inflight_prev < 0) {
+ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+ old_factor,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->tx.in_flight),
+ "inconsistent: tx.in_flight: %u "
+ "old_factor: %d mss: %u sacked: %u "
+ "1st pcount: %d 2nd pcount: %d "
+ "1st len: %u 2nd len: %u ",
+ TCP_SKB_CB(skb)->tx.in_flight, old_factor,
+ mss_now, TCP_SKB_CB(skb)->sacked,
+ tcp_skb_pcount(skb), tcp_skb_pcount(buff),
+ skb->len, buff->len);
+ inflight_prev = 0;
+ }
+ /* Set 1st tx.in_flight as if 1st were sent by itself: */
+ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev +
+ tcp_skb_pcount(skb);
+ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */
+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+ tcp_skb_pcount(skb) +
+ tcp_skb_pcount(buff);
}
/* Link BUFF into the send queue. */

View File

@@ -0,0 +1,73 @@
From 65cca0e8fd954a150ec874650af47f7800ea3049 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Wed, 23 May 2018 17:55:54 -0700
Subject: [PATCH 09/19] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS
Add a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a
congestion control module to receive CE events.
Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN
bit in opts flag to receive CE events but this may incur changes in ECN
behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS
that allows congestion control modules to receive CE events
independently of TCP_CONG_NEEDS_ECN.
Effort: net-tcp
Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b
Change-Id: I2255506985242f376d910c6fd37daabaf4744f24
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 14 +++++++++++++-
net/ipv4/tcp_input.c | 4 ++--
2 files changed, 15 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1119,7 +1119,11 @@ enum tcp_ca_ack_event_flags {
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN 0x2
-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS 0x4
+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \
+ TCP_CONG_NEEDS_ECN | \
+ TCP_CONG_WANTS_CE_EVENTS)
union tcp_cc_info;
@@ -1251,6 +1255,14 @@ static inline char *tcp_ca_get_name_by_k
}
#endif
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+ TCP_CONG_WANTS_CE_EVENTS);
+}
+
static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct so
tcp_enter_quickack_mode(sk, 2);
break;
case INET_ECN_CE:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct so
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_wants_ce_events(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;

View File

@@ -0,0 +1,118 @@
From 3acb852e1cfcdeea388bd428c6dd81609fd40792 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Fri, 27 Sep 2019 17:10:26 -0400
Subject: [PATCH 10/19] net-tcp: re-generalize TSO sizing in TCP CC module API
Reorganize the API for CC modules so that the CC module once again
gets complete control of the TSO sizing decision. This is how the API
was set up around 2016 and the initial BBRv1 upstreaming. Later Eric
Dumazet simplified it. But with wider testing it now seems that to
avoid CPU regressions BBR needs to have a different TSO sizing
function.
This is necessary to handle cases where there are many flows
bottlenecked on the sender host's NIC, in which case BBR's pacing rate
is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because
BBR's pacing rate adapts to the low bandwidth share each flow sees. By
contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very
large cwnd, and thus large pacing rate and large TSO burst size.
Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 4 ++--
net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++-----------
net/ipv4/tcp_output.c | 11 +++++------
3 files changed, 33 insertions(+), 19 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1185,8 +1185,8 @@ struct tcp_congestion_ops {
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* override sysctl_tcp_min_tso_segs */
- u32 (*min_tso_segs)(struct sock *sk);
+ /* pick target number of segments per TSO/GSO skb (optional): */
+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
/* react to a specific lost skb (optional) */
void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -301,20 +301,35 @@ __bpf_kfunc static u32 bbr_min_tso_segs(
return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
+/* Return the number of segments BBR would like in a TSO/GSO skb, given
+ * a particular max gso size as a constraint.
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+ u32 gso_max_size)
+{
+ u32 segs;
+ u64 bytes;
+
+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
+
+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+ segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk));
+ return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
+}
+
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
static u32 bbr_tso_segs_goal(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 segs, bytes;
-
- /* Sort of tcp_tso_autosize() but ignoring
- * driver provided sk_gso_max_size.
- */
- bytes = min_t(unsigned long,
- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
- return min(segs, 0x7FU);
+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE);
}
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -1150,7 +1165,7 @@ static struct tcp_congestion_ops tcp_bbr
.undo_cwnd = bbr_undo_cwnd,
.cwnd_event = bbr_cwnd_event,
.ssthresh = bbr_ssthresh,
- .min_tso_segs = bbr_min_tso_segs,
+ .tso_segs = bbr_tso_segs,
.get_info = bbr_get_info,
.set_state = bbr_set_state,
};
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2057,13 +2057,12 @@ static u32 tcp_tso_autosize(const struct
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
- u32 min_tso, tso_segs;
+ u32 tso_segs;
- min_tso = ca_ops->min_tso_segs ?
- ca_ops->min_tso_segs(sk) :
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
-
- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+ tso_segs = ca_ops->tso_segs ?
+ ca_ops->tso_segs(sk, mss_now) :
+ tcp_tso_autosize(sk, mss_now,
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}

View File

@@ -0,0 +1,72 @@
From 3741ada76bab5111cbb9c279cf27e67a0334eb05 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 7 Jan 2024 21:11:26 -0300
Subject: [PATCH 11/19] net-tcp: add fast_ack_mode=1: skip rwin check in
tcp_fast_ack_mode__tcp_ack_snd_check()
Add logic for an experimental TCP connection behavior, enabled with
tp->fast_ack_mode = 1, which disables checking the receive window
before sending an ack in __tcp_ack_snd_check(). If this behavior is
enabled, the data receiver sends an ACK if the amount of data is >
RCV.MSS.
Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/tcp.h | 3 ++-
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_cong.c | 1 +
net/ipv4/tcp_input.c | 5 +++--
4 files changed, 7 insertions(+), 3 deletions(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -369,7 +369,8 @@ struct tcp_sock {
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
- unused:5;
+ fast_ack_mode:2, /* which fast ack mode ? */
+ unused:3;
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
tp->rcv_ooopack = 0;
+ tp->fast_ack_mode = 0;
/* Clean up fastopen related fields */
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_sk(sk)->prior_ssthresh = 0;
+ tcp_sk(sk)->fast_ack_mode = 0;
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
if (tcp_ca_needs_ecn(sk))
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5763,13 +5763,14 @@ static void __tcp_ack_snd_check(struct s
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+ (tp->fast_ack_mode == 1 ||
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise).
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
*/
- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
- __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+ __tcp_select_window(sk) >= tp->rcv_wnd))) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */

View File

@@ -0,0 +1,45 @@
From e5d35b7c882b7001f8a31b14c9f08917230dedc3 Mon Sep 17 00:00:00 2001
From: Jianfeng Wang <jfwang@google.com>
Date: Fri, 19 Jun 2020 17:33:45 +0000
Subject: [PATCH 12/19] net-tcp_bbr: v2: record app-limited status of
TLP-repaired flight
When sending a TLP retransmit, record whether the outstanding flight
of data is application limited. This is important for congestion
control modules that want to respond to losses repaired by TLP
retransmits. This is important because the following scenarios convey
very different information:
(1) a packet loss with a small number of packets in flight;
(2) a packet loss with the maximum amount of data in flight allowed
by the CC module;
Effort: net-tcp_bbr
Change-Id: Ic8ae567caa4e4bfd5fd82c3d4be12a5d9171655e
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/tcp.h | 3 ++-
net/ipv4/tcp_output.c | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -370,7 +370,8 @@ struct tcp_sock {
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
fast_ack_mode:2, /* which fast ack mode ? */
- unused:3;
+ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */
+ unused:2;
u8 thin_lto : 1,/* Use linear timeouts for thin streams */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3003,6 +3003,7 @@ void tcp_send_loss_probe(struct sock *sk
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
+ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited;
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;

View File

@@ -0,0 +1,45 @@
From 77e7c22b63f8934206b1e89c173558c3967f0779 Mon Sep 17 00:00:00 2001
From: Jianfeng Wang <jfwang@google.com>
Date: Tue, 16 Jun 2020 17:41:19 +0000
Subject: [PATCH 13/19] net-tcp_bbr: v2: inform CC module of losses repaired by
TLP probe
Before this commit, when there is a packet loss that creates a sequence
hole that is filled by a TLP loss probe, then tcp_process_tlp_ack()
only informs the congestion control (CC) module via a back-to-back entry
and exit of CWR. But some congestion control modules (e.g. BBR) do not
respond to CWR events.
This commit adds a new CA event with which the core TCP stack notifies
the CC module when a loss is repaired by a TLP. This will allow CC
modules that do not use the CWR mechanism to have a custom handler for
such TLP recoveries.
Effort: net-tcp_bbr
Change-Id: Ieba72332b401b329bff5a641d2b2043a3fb8f632
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 1 +
2 files changed, 2 insertions(+)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1097,6 +1097,7 @@ enum tcp_ca_event {
CA_EVENT_LOSS, /* loss timeout */
CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
+ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */
};
/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3859,6 +3859,7 @@ static void tcp_process_tlp_ack(struct s
/* ACK advances: there was a loss, so reduce cwnd. Reset
* tlp_high_seq in tcp_init_cwnd_reduction()
*/
+ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY);
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);

View File

@@ -0,0 +1,73 @@
From cab22a8e2e87870e8334a12ffcd0ba04ea81126f Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 21 Sep 2020 14:46:26 -0400
Subject: [PATCH 14/19] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq
into rate_sample
Introduce is_acking_tlp_retrans_seq into rate_sample. This bool will
export to the CC module the knowledge of whether the current ACK
matched a TLP retransmit.
Note that when this bool is true, we cannot yet tell (in general) whether
this ACK is for the original or the TLP retransmit.
Effort: net-tcp_bbr
Change-Id: I2e6494332167e75efcbdc99bd5c119034e9c39b4
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_input.c | 12 +++++++++---
2 files changed, 10 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1161,6 +1161,7 @@ struct rate_sample {
u32 last_end_seq; /* end_seq of most recently ACKed packet */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
+ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
bool is_ece; /* did this ACK have ECN marked? */
};
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3842,7 +3842,8 @@ static void tcp_replace_ts_recent(struct
/* This routine deals with acks during a TLP episode and ends an episode by
* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
*/
-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag,
+ struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3870,6 +3871,11 @@ static void tcp_process_tlp_ack(struct s
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
+ } else {
+ /* This ACK matches a TLP retransmit. We cannot yet tell if
+ * this ACK is for the original or the TLP retransmit.
+ */
+ rs->is_acking_tlp_retrans_seq = 1;
}
}
@@ -4053,7 +4059,7 @@ static int tcp_ack(struct sock *sk, cons
tcp_rack_update_reo_wnd(sk, &rs);
if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
+ tcp_process_tlp_ack(sk, ack, flag, &rs);
if (tcp_ack_is_dubious(sk, flag)) {
if (!(flag & (FLAG_SND_UNA_ADVANCED |
@@ -4097,7 +4103,7 @@ no_queue:
tcp_ack_probe(sk);
if (tp->tlp_high_seq)
- tcp_process_tlp_ack(sk, ack, flag);
+ tcp_process_tlp_ack(sk, ack, flag, &rs);
return 1;
old_ack:

View File

@@ -0,0 +1,112 @@
From 38dd25482f815d949fec91edd7694b2f15823f67 Mon Sep 17 00:00:00 2001
From: David Morley <morleyd@google.com>
Date: Fri, 14 Jul 2023 11:07:56 -0400
Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW
Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW.
This feature indicates that the given destination network is a
low-latency ECN environment, meaning both that ECN CE marks are
applied by the network using a low-latency marking threshold and also
that TCP endpoints provide precise per-data-segment ECN feedback in
ACKs (where the ACK ECE flag echoes the received CE status of all
newly-acknowledged data segments). This feature indication can be used
by congestion control algorithms to decide how to interpret ECN
signals over the given destination network.
This feature is appropriate for datacenter-style ECN marking, such as
the ECN marking approach expected by DCTCP or BBR congestion control
modules.
Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 10 ++++++++++
include/uapi/linux/rtnetlink.h | 4 +++-
net/ipv4/tcp_minisocks.c | 2 ++
net/ipv4/tcp_output.c | 6 ++++--
4 files changed, 19 insertions(+), 3 deletions(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,7 @@ static inline void tcp_dec_quickack_mode
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
+#define TCP_ECN_LOW 16
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
@@ -777,6 +778,15 @@ static inline void tcp_fast_path_check(s
tcp_fast_path_on(tp);
}
+static inline void tcp_set_ecn_low_from_dst(struct sock *sk,
+ const struct dst_entry *dst)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW))
+ tp->ecn_flags |= TCP_ECN_LOW;
+}
+
u32 tcp_delack_max(const struct sock *sk);
/* Compute the actual rto_min value */
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -507,12 +507,14 @@ enum {
#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
#define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */
#define RTAX_FEATURE_TCP_USEC_TS (1 << 4)
+#define RTAX_FEATURE_ECN_LOW (1 << 5)
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | \
RTAX_FEATURE_ALLFRAG | \
- RTAX_FEATURE_TCP_USEC_TS)
+ RTAX_FEATURE_TCP_USEC_TS | \
+ RTAX_FEATURE_ECN_LOW)
struct rta_session {
__u8 proto;
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *s
u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
bool ca_got_dst = false;
+ tcp_set_ecn_low_from_dst(sk, dst);
+
if (ca_key != TCP_CA_UNSPEC) {
const struct tcp_congestion_ops *ca;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+ const struct dst_entry *dst = __sk_dst_get(sk);
if (!use_ecn) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
use_ecn = true;
}
@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock
tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
+
+ if (dst)
+ tcp_set_ecn_low_from_dst(sk, dst);
}
}
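
Setting the new flag on a route requires userspace support: iproute2 already has a "features" keyword for RTAX_FEATURE_* bits, but the "ecn_low" name below is hypothetical and would need a matching iproute2 patch; only the kernel-side bit (1 << 5) is defined here.

# hypothetical usage, assumes an iproute2 build that knows the new feature name
ip route change 192.0.2.0/24 via 192.0.2.1 features ecn_low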

File diff suppressed because it is too large.

View File

@@ -0,0 +1,59 @@
From 99e86f904f246ae9ec7a13d1b920eaf4a8c2d47b Mon Sep 17 00:00:00 2001
From: Adithya Abraham Philip <abrahamphilip@google.com>
Date: Fri, 11 Jun 2021 21:56:10 +0000
Subject: [PATCH 17/19] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT
on retransmits
Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to
indicate that retransmitted packets and pure ACKs must have the
ECT bit set. This is necessary for BBR, which when using
ECN expects ECT to be set even on retransmitted packets and ACKs.
Previous to this addition of TCP_ECN_ECT_PERMANENT, CCAs which can use
ECN but don't "need" it did not have a way to indicate that ECT should
be set on retransmissions/ACKs.
Signed-off-by: Adithya Abraham Philip <abrahamphilip@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Change-Id: I8b048eaab35e136fe6501ef6cd89fd9faa15e6d2
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/net/tcp.h | 1 +
net/ipv4/tcp_bbr.c | 3 +++
net/ipv4/tcp_output.c | 3 ++-
3 files changed, 6 insertions(+), 1 deletion(-)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -376,6 +376,7 @@ static inline void tcp_dec_quickack_mode
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
#define TCP_ECN_LOW 16
+#define TCP_ECN_ECT_PERMANENT 32
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -2151,6 +2151,9 @@ __bpf_kfunc static void bbr_init(struct
bbr->plb.pause_until = 0;
tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0;
+
+ if (bbr_can_use_ecn(sk))
+ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
}
/* BBR marks the current round trip as a loss round. */
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -390,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk
th->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
- } else if (!tcp_ca_needs_ecn(sk)) {
+ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+ !tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
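
With the BBRv3 module itself imported in the (suppressed) large diff above, opting in on a running system is a matter of sysctls. A sketch; "bbr" is the name the replacement module registers, and fq pacing plus ECN are optional but commonly paired with it:

# make BBRv3 the default congestion control, with fq for pacing
sysctl -w net.core.default_qdisc=fq
sysctl -w net.ipv4.tcp_congestion_control=bbr
# negotiate ECN so the ECT-on-retransmit behaviour added here is actually exercised
sysctl -w net.ipv4.tcp_ecn=1
# per-socket verification: 'ss -ti' prints the congestion control in use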

View File

@@ -0,0 +1,38 @@
From 5d7cb61552d374bcaaa95022129b4ca1eace1c33 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sun, 23 Jul 2023 23:25:34 -0400
Subject: [PATCH 18/19] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options
field
Analogous to other important ECN information, export TCPI_OPT_ECN_LOW
in tcp_info tcpi_options field.
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Change-Id: I08d8d8c7e8780e6e37df54038ee50301ac5a0320
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/uapi/linux/tcp.h | 1 +
net/ipv4/tcp.c | 2 ++
2 files changed, 3 insertions(+)
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail {
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
#define TCPI_OPT_USEC_TS 64 /* usec timestamps */
+#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */
/*
* Sender's congestion state indicating normal or abnormal situations
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3850,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struc
info->tcpi_options |= TCPI_OPT_ECN;
if (tp->ecn_flags & TCP_ECN_SEEN)
info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->ecn_flags & TCP_ECN_LOW)
+ info->tcpi_options |= TCPI_OPT_ECN_LOW;
if (tp->syn_data_acked)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
if (tp->tcp_usec_ts)

View File

@@ -0,0 +1,42 @@
From 39838c2f0b09bec02004c092904aada85da2bc2e Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 11 Mar 2024 12:01:13 -0300
Subject: [PATCH 19/19] x86/cfi,bpf: Add tso_segs and skb_marked_lost to
bpf_struct_ops CFI
Rebased-by: Oleksandr Natalenko <oleksandr@natalenko.name>
[ https://github.com/sirlucjan/kernel-patches/blob/master/6.8/bbr3-patches/0001-tcp-bbr3-initial-import.patch ]
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/ipv4/bpf_tcp_ca.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct
{
}
-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now)
{
return 0;
}
+static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
+{
+}
+
static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
const struct rate_sample *rs)
{
@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_o
.cwnd_event = bpf_tcp_ca_cwnd_event,
.in_ack_event = bpf_tcp_ca_in_ack_event,
.pkts_acked = bpf_tcp_ca_pkts_acked,
- .min_tso_segs = bpf_tcp_ca_min_tso_segs,
+ .tso_segs = bpf_tcp_ca_tso_segs,
+ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost,
.cong_control = bpf_tcp_ca_cong_control,
.undo_cwnd = bpf_tcp_ca_undo_cwnd,
.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,

View File

@@ -0,0 +1,375 @@
From 60b01019526236e40466cf20bf1192074e5e1a7c Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:27 -0500
Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ANY.
This corresponds to part of the functionality of the NT syscall
NtWaitForMultipleObjects(). Specifically, it implements the behaviour where
the third argument (wait_any) is TRUE, and it does not handle alertable waits.
Those features have been split out into separate patches to ease review.
This patch therefore implements the wait/wake infrastructure which comprises the
core of ntsync's functionality.
NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike
poll(), it "consumes" objects when they are signaled. For semaphores, this means
decreasing one from the internal counter. At most one object can be consumed by
this function.
This wait/wake model is fundamentally different from that used anywhere else in
the kernel, and for that reason ntsync does not use any existing infrastructure,
such as futexes, kernel mutexes or semaphores, or wait_event().
Up to 64 objects can be waited on at once. As soon as one is signaled, the
object with the lowest index is consumed, and that index is returned via the
"index" field.
A timeout is supported. The timeout is passed as a u64 nanosecond value, which
represents absolute time measured against either the MONOTONIC or REALTIME clock
(controlled by the flags argument). If U64_MAX is passed, the ioctl waits
indefinitely.
This ioctl validates that all objects belong to the relevant device. This is not
necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be
necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch.
Some padding fields are added for alignment and for fields which will be added
in future patches (split out to ease review).
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 245 ++++++++++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 14 +++
2 files changed, 259 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -6,11 +6,16 @@
*/
#include <linux/anon_inodes.h>
+#include <linux/atomic.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/overflow.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <uapi/linux/ntsync.h>
@@ -30,6 +35,8 @@ enum ntsync_type {
*
* Both rely on struct file for reference counting. Individual
* ntsync_obj objects take a reference to the device when created.
+ * Wait operations take a reference to each object being waited on for
+ * the duration of the wait.
*/
struct ntsync_obj {
@@ -47,12 +54,55 @@ struct ntsync_obj {
__u32 max;
} sem;
} u;
+
+ struct list_head any_waiters;
+};
+
+struct ntsync_q_entry {
+ struct list_head node;
+ struct ntsync_q *q;
+ struct ntsync_obj *obj;
+ __u32 index;
+};
+
+struct ntsync_q {
+ struct task_struct *task;
+
+ /*
+ * Protected via atomic_try_cmpxchg(). Only the thread that wins the
+ * compare-and-swap may actually change object states and wake this
+ * task.
+ */
+ atomic_t signaled;
+
+ __u32 count;
+ struct ntsync_q_entry entries[];
};
struct ntsync_device {
struct file *file;
};
+static void try_wake_any_sem(struct ntsync_obj *sem)
+{
+ struct ntsync_q_entry *entry;
+
+ lockdep_assert_held(&sem->lock);
+
+ list_for_each_entry(entry, &sem->any_waiters, node) {
+ struct ntsync_q *q = entry->q;
+ int signaled = -1;
+
+ if (!sem->u.sem.count)
+ break;
+
+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+ sem->u.sem.count--;
+ wake_up_process(q->task);
+ }
+ }
+}
+
/*
* Actually change the semaphore state, returning -EOVERFLOW if it is made
* invalid.
@@ -88,6 +138,8 @@ static int ntsync_sem_post(struct ntsync
prev_count = sem->u.sem.count;
ret = post_sem_state(sem, args);
+ if (!ret)
+ try_wake_any_sem(sem);
spin_unlock(&sem->lock);
@@ -141,6 +193,7 @@ static struct ntsync_obj *ntsync_alloc_o
obj->dev = dev;
get_file(dev->file);
spin_lock_init(&obj->lock);
+ INIT_LIST_HEAD(&obj->any_waiters);
return obj;
}
@@ -191,6 +244,196 @@ static int ntsync_create_sem(struct ntsy
return put_user(fd, &user_args->sem);
}
+static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
+{
+ struct file *file = fget(fd);
+ struct ntsync_obj *obj;
+
+ if (!file)
+ return NULL;
+
+ if (file->f_op != &ntsync_obj_fops) {
+ fput(file);
+ return NULL;
+ }
+
+ obj = file->private_data;
+ if (obj->dev != dev) {
+ fput(file);
+ return NULL;
+ }
+
+ return obj;
+}
+
+static void put_obj(struct ntsync_obj *obj)
+{
+ fput(obj->file);
+}
+
+static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args)
+{
+ ktime_t timeout = ns_to_ktime(args->timeout);
+ clockid_t clock = CLOCK_MONOTONIC;
+ ktime_t *timeout_ptr;
+ int ret = 0;
+
+ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout);
+
+ if (args->flags & NTSYNC_WAIT_REALTIME)
+ clock = CLOCK_REALTIME;
+
+ do {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&q->signaled) != -1) {
+ ret = 0;
+ break;
+ }
+ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock);
+ } while (ret < 0);
+ __set_current_state(TASK_RUNNING);
+
+ return ret;
+}
+
+/*
+ * Allocate and initialize the ntsync_q structure, but do not queue us yet.
+ */
+static int setup_wait(struct ntsync_device *dev,
+ const struct ntsync_wait_args *args,
+ struct ntsync_q **ret_q)
+{
+ const __u32 count = args->count;
+ int fds[NTSYNC_MAX_WAIT_COUNT];
+ struct ntsync_q *q;
+ __u32 i, j;
+
+ if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME))
+ return -EINVAL;
+
+ if (args->count > NTSYNC_MAX_WAIT_COUNT)
+ return -EINVAL;
+
+ if (copy_from_user(fds, u64_to_user_ptr(args->objs),
+ array_size(count, sizeof(*fds))))
+ return -EFAULT;
+
+ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL);
+ if (!q)
+ return -ENOMEM;
+ q->task = current;
+ atomic_set(&q->signaled, -1);
+ q->count = count;
+
+ for (i = 0; i < count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = get_obj(dev, fds[i]);
+
+ if (!obj)
+ goto err;
+
+ entry->obj = obj;
+ entry->q = q;
+ entry->index = i;
+ }
+
+ *ret_q = q;
+ return 0;
+
+err:
+ for (j = 0; j < i; j++)
+ put_obj(q->entries[j].obj);
+ kfree(q);
+ return -EINVAL;
+}
+
+static void try_wake_any_obj(struct ntsync_obj *obj)
+{
+ switch (obj->type) {
+ case NTSYNC_TYPE_SEM:
+ try_wake_any_sem(obj);
+ break;
+ }
+}
+
+static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_wait_args args;
+ struct ntsync_q *q;
+ int signaled;
+ __u32 i;
+ int ret;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ ret = setup_wait(dev, &args, &q);
+ if (ret < 0)
+ return ret;
+
+ /* queue ourselves */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_add_tail(&entry->node, &obj->any_waiters);
+ spin_unlock(&obj->lock);
+ }
+
+ /* check if we are already signaled */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_obj *obj = q->entries[i].obj;
+
+ if (atomic_read(&q->signaled) != -1)
+ break;
+
+ spin_lock(&obj->lock);
+ try_wake_any_obj(obj);
+ spin_unlock(&obj->lock);
+ }
+
+ /* sleep */
+
+ ret = ntsync_schedule(q, &args);
+
+ /* and finally, unqueue */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_del(&entry->node);
+ spin_unlock(&obj->lock);
+
+ put_obj(obj);
+ }
+
+ signaled = atomic_read(&q->signaled);
+ if (signaled != -1) {
+ struct ntsync_wait_args __user *user_args = argp;
+
+ /* even if we caught a signal, we need to communicate success */
+ ret = 0;
+
+ if (put_user(signaled, &user_args->index))
+ ret = -EFAULT;
+ } else if (!ret) {
+ ret = -ETIMEDOUT;
+ }
+
+ kfree(q);
+ return ret;
+}
+
static int ntsync_char_open(struct inode *inode, struct file *file)
{
struct ntsync_device *dev;
@@ -222,6 +465,8 @@ static long ntsync_char_ioctl(struct fil
switch (cmd) {
case NTSYNC_IOC_CREATE_SEM:
return ntsync_create_sem(dev, argp);
+ case NTSYNC_IOC_WAIT_ANY:
+ return ntsync_wait_any(dev, argp);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -16,7 +16,21 @@ struct ntsync_sem_args {
__u32 max;
};
+#define NTSYNC_WAIT_REALTIME 0x1
+
+struct ntsync_wait_args {
+ __u64 timeout;
+ __u64 objs;
+ __u32 count;
+ __u32 index;
+ __u32 flags;
+ __u32 pad[3];
+};
+
+#define NTSYNC_MAX_WAIT_COUNT 64
+
#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args)
+#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
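A minimal user-space sketch of the wait-any ioctl added above (illustrative only; it assumes the patched <linux/ntsync.h> header is installed and the loaded driver exposes /dev/ntsync):

    /* Create a semaphore on /dev/ntsync and consume it via NTSYNC_IOC_WAIT_ANY. */
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        struct ntsync_sem_args sem = { .count = 1, .max = 2 };
        struct ntsync_wait_args wait = { 0 };
        int fds[1];
        int dev;

        dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);
        if (dev < 0) { perror("open /dev/ntsync"); return 1; }

        if (ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem) < 0) { perror("create sem"); return 1; }
        fds[0] = sem.sem;               /* fd of the new semaphore */

        wait.timeout = UINT64_MAX;      /* U64_MAX means "never time out" */
        wait.objs = (uintptr_t)fds;     /* user pointer to the fd array */
        wait.count = 1;
        wait.flags = 0;                 /* monotonic clock; NTSYNC_WAIT_REALTIME selects CLOCK_REALTIME */

        /* The semaphore count is 1, so the wait is satisfied immediately and decrements it. */
        if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) < 0) { perror("wait any"); return 1; }
        printf("woken by object index %u\n", wait.index);

        close(fds[0]);
        close(dev);
        return 0;
    }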

@@ -0,0 +1,532 @@
From 73fc33606fcb7028ec1ee6027a361de4e85ab5d6 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:28 -0500
Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ALL.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are
simultaneously signaled, and then acquires all of them as a single atomic
operation.
Because acquisition of multiple objects is atomic, some complex locking is
required. We cannot simply spin-lock multiple objects simultaneously, as that
may disable preëmption for a problematically long time.
Instead, modifying any object which may be involved in a wait-all operation takes
a device-wide sleeping mutex, "wait_all_lock", instead of the normal object
spinlock.
Because wait-for-all is a rare operation, in order to optimize wait-for-any,
this lock is only taken when necessary. "all_hint" is used to mark objects which
are involved in a wait-for-all operation, and if an object is not, only its
spinlock is taken.
The locking scheme used here was written by Peter Zijlstra.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 334 ++++++++++++++++++++++++++++++++++--
include/uapi/linux/ntsync.h | 1 +
2 files changed, 322 insertions(+), 13 deletions(-)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -13,6 +13,7 @@
#include <linux/ktime.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/overflow.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
@@ -41,6 +42,7 @@ enum ntsync_type {
struct ntsync_obj {
spinlock_t lock;
+ int dev_locked;
enum ntsync_type type;
@@ -55,7 +57,30 @@ struct ntsync_obj {
} sem;
} u;
+ /*
+ * any_waiters is protected by the object lock, but all_waiters is
+ * protected by the device wait_all_lock.
+ */
struct list_head any_waiters;
+ struct list_head all_waiters;
+
+ /*
+ * Hint describing how many tasks are queued on this object in a
+ * wait-all operation.
+ *
+ * Any time we do a wake, we may need to wake "all" waiters as well as
+ * "any" waiters. In order to atomically wake "all" waiters, we must
+ * lock all of the objects, and that means grabbing the wait_all_lock
+ * below (and, due to lock ordering rules, before locking this object).
+ * However, wait-all is a rare operation, and grabbing the wait-all
+ * lock for every wake would create unnecessary contention.
+ * Therefore we first check whether all_hint is zero, and, if it is,
+ * we skip trying to wake "all" waiters.
+ *
+ * Since wait requests must originate from user-space threads, we're
+ * limited here by PID_MAX_LIMIT, so there's no risk of overflow.
+ */
+ atomic_t all_hint;
};
struct ntsync_q_entry {
@@ -75,19 +100,198 @@ struct ntsync_q {
*/
atomic_t signaled;
+ bool all;
__u32 count;
struct ntsync_q_entry entries[];
};
struct ntsync_device {
+ /*
+ * Wait-all operations must atomically grab all objects, and be totally
+ * ordered with respect to each other and wait-any operations.
+ * If one thread is trying to acquire several objects, another thread
+ * cannot touch the object at the same time.
+ *
+ * This device-wide lock is used to serialize wait-for-all
+ * operations, and operations on an object that is involved in a
+ * wait-for-all.
+ */
+ struct mutex wait_all_lock;
+
struct file *file;
};
+/*
+ * Single objects are locked using obj->lock.
+ *
+ * Multiple objects are 'locked' while holding dev->wait_all_lock.
+ * In this case however, individual objects are not locked by holding
+ * obj->lock, but by setting obj->dev_locked.
+ *
+ * This means that in order to lock a single object, the sequence is slightly
+ * more complicated than usual. Specifically it needs to check obj->dev_locked
+ * after acquiring obj->lock, if set, it needs to drop the lock and acquire
+ * dev->wait_all_lock in order to serialize against the multi-object operation.
+ */
+
+static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+ lockdep_assert_held(&dev->wait_all_lock);
+ lockdep_assert(obj->dev == dev);
+ spin_lock(&obj->lock);
+ /*
+ * By setting obj->dev_locked inside obj->lock, it is ensured that
+ * anyone holding obj->lock must see the value.
+ */
+ obj->dev_locked = 1;
+ spin_unlock(&obj->lock);
+}
+
+static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+ lockdep_assert_held(&dev->wait_all_lock);
+ lockdep_assert(obj->dev == dev);
+ spin_lock(&obj->lock);
+ obj->dev_locked = 0;
+ spin_unlock(&obj->lock);
+}
+
+static void obj_lock(struct ntsync_obj *obj)
+{
+ struct ntsync_device *dev = obj->dev;
+
+ for (;;) {
+ spin_lock(&obj->lock);
+ if (likely(!obj->dev_locked))
+ break;
+
+ spin_unlock(&obj->lock);
+ mutex_lock(&dev->wait_all_lock);
+ spin_lock(&obj->lock);
+ /*
+ * obj->dev_locked should be set and released under the same
+ * wait_all_lock section, since we now own this lock, it should
+ * be clear.
+ */
+ lockdep_assert(!obj->dev_locked);
+ spin_unlock(&obj->lock);
+ mutex_unlock(&dev->wait_all_lock);
+ }
+}
+
+static void obj_unlock(struct ntsync_obj *obj)
+{
+ spin_unlock(&obj->lock);
+}
+
+static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+ bool all;
+
+ obj_lock(obj);
+ all = atomic_read(&obj->all_hint);
+ if (unlikely(all)) {
+ obj_unlock(obj);
+ mutex_lock(&dev->wait_all_lock);
+ dev_lock_obj(dev, obj);
+ }
+
+ return all;
+}
+
+static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all)
+{
+ if (all) {
+ dev_unlock_obj(dev, obj);
+ mutex_unlock(&dev->wait_all_lock);
+ } else {
+ obj_unlock(obj);
+ }
+}
+
+#define ntsync_assert_held(obj) \
+ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \
+ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \
+ (obj)->dev_locked))
+
+static bool is_signaled(struct ntsync_obj *obj)
+{
+ ntsync_assert_held(obj);
+
+ switch (obj->type) {
+ case NTSYNC_TYPE_SEM:
+ return !!obj->u.sem.count;
+ }
+
+ WARN(1, "bad object type %#x\n", obj->type);
+ return false;
+}
+
+/*
+ * "locked_obj" is an optional pointer to an object which is already locked and
+ * should not be locked again. This is necessary so that changing an object's
+ * state and waking it can be a single atomic operation.
+ */
+static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q,
+ struct ntsync_obj *locked_obj)
+{
+ __u32 count = q->count;
+ bool can_wake = true;
+ int signaled = -1;
+ __u32 i;
+
+ lockdep_assert_held(&dev->wait_all_lock);
+ if (locked_obj)
+ lockdep_assert(locked_obj->dev_locked);
+
+ for (i = 0; i < count; i++) {
+ if (q->entries[i].obj != locked_obj)
+ dev_lock_obj(dev, q->entries[i].obj);
+ }
+
+ for (i = 0; i < count; i++) {
+ if (!is_signaled(q->entries[i].obj)) {
+ can_wake = false;
+ break;
+ }
+ }
+
+ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) {
+ for (i = 0; i < count; i++) {
+ struct ntsync_obj *obj = q->entries[i].obj;
+
+ switch (obj->type) {
+ case NTSYNC_TYPE_SEM:
+ obj->u.sem.count--;
+ break;
+ }
+ }
+ wake_up_process(q->task);
+ }
+
+ for (i = 0; i < count; i++) {
+ if (q->entries[i].obj != locked_obj)
+ dev_unlock_obj(dev, q->entries[i].obj);
+ }
+}
+
+static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+ struct ntsync_q_entry *entry;
+
+ lockdep_assert_held(&dev->wait_all_lock);
+ lockdep_assert(obj->dev_locked);
+
+ list_for_each_entry(entry, &obj->all_waiters, node)
+ try_wake_all(dev, entry->q, obj);
+}
+
static void try_wake_any_sem(struct ntsync_obj *sem)
{
struct ntsync_q_entry *entry;
- lockdep_assert_held(&sem->lock);
+ ntsync_assert_held(sem);
+ lockdep_assert(sem->type == NTSYNC_TYPE_SEM);
list_for_each_entry(entry, &sem->any_waiters, node) {
struct ntsync_q *q = entry->q;
@@ -111,7 +315,7 @@ static int post_sem_state(struct ntsync_
{
__u32 sum;
- lockdep_assert_held(&sem->lock);
+ ntsync_assert_held(sem);
if (check_add_overflow(sem->u.sem.count, count, &sum) ||
sum > sem->u.sem.max)
@@ -123,9 +327,11 @@ static int post_sem_state(struct ntsync_
static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
{
+ struct ntsync_device *dev = sem->dev;
__u32 __user *user_args = argp;
__u32 prev_count;
__u32 args;
+ bool all;
int ret;
if (copy_from_user(&args, argp, sizeof(args)))
@@ -134,14 +340,17 @@ static int ntsync_sem_post(struct ntsync
if (sem->type != NTSYNC_TYPE_SEM)
return -EINVAL;
- spin_lock(&sem->lock);
+ all = ntsync_lock_obj(dev, sem);
prev_count = sem->u.sem.count;
ret = post_sem_state(sem, args);
- if (!ret)
+ if (!ret) {
+ if (all)
+ try_wake_all_obj(dev, sem);
try_wake_any_sem(sem);
+ }
- spin_unlock(&sem->lock);
+ ntsync_unlock_obj(dev, sem, all);
if (!ret && put_user(prev_count, user_args))
ret = -EFAULT;
@@ -194,6 +403,8 @@ static struct ntsync_obj *ntsync_alloc_o
get_file(dev->file);
spin_lock_init(&obj->lock);
INIT_LIST_HEAD(&obj->any_waiters);
+ INIT_LIST_HEAD(&obj->all_waiters);
+ atomic_set(&obj->all_hint, 0);
return obj;
}
@@ -305,7 +516,7 @@ static int ntsync_schedule(const struct
* Allocate and initialize the ntsync_q structure, but do not queue us yet.
*/
static int setup_wait(struct ntsync_device *dev,
- const struct ntsync_wait_args *args,
+ const struct ntsync_wait_args *args, bool all,
struct ntsync_q **ret_q)
{
const __u32 count = args->count;
@@ -328,6 +539,7 @@ static int setup_wait(struct ntsync_devi
return -ENOMEM;
q->task = current;
atomic_set(&q->signaled, -1);
+ q->all = all;
q->count = count;
for (i = 0; i < count; i++) {
@@ -337,6 +549,16 @@ static int setup_wait(struct ntsync_devi
if (!obj)
goto err;
+ if (all) {
+ /* Check that the objects are all distinct. */
+ for (j = 0; j < i; j++) {
+ if (obj == q->entries[j].obj) {
+ put_obj(obj);
+ goto err;
+ }
+ }
+ }
+
entry->obj = obj;
entry->q = q;
entry->index = i;
@@ -366,13 +588,14 @@ static int ntsync_wait_any(struct ntsync
struct ntsync_wait_args args;
struct ntsync_q *q;
int signaled;
+ bool all;
__u32 i;
int ret;
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
- ret = setup_wait(dev, &args, &q);
+ ret = setup_wait(dev, &args, false, &q);
if (ret < 0)
return ret;
@@ -382,9 +605,9 @@ static int ntsync_wait_any(struct ntsync
struct ntsync_q_entry *entry = &q->entries[i];
struct ntsync_obj *obj = entry->obj;
- spin_lock(&obj->lock);
+ all = ntsync_lock_obj(dev, obj);
list_add_tail(&entry->node, &obj->any_waiters);
- spin_unlock(&obj->lock);
+ ntsync_unlock_obj(dev, obj, all);
}
/* check if we are already signaled */
@@ -395,9 +618,9 @@ static int ntsync_wait_any(struct ntsync
if (atomic_read(&q->signaled) != -1)
break;
- spin_lock(&obj->lock);
+ all = ntsync_lock_obj(dev, obj);
try_wake_any_obj(obj);
- spin_unlock(&obj->lock);
+ ntsync_unlock_obj(dev, obj, all);
}
/* sleep */
@@ -410,13 +633,94 @@ static int ntsync_wait_any(struct ntsync
struct ntsync_q_entry *entry = &q->entries[i];
struct ntsync_obj *obj = entry->obj;
- spin_lock(&obj->lock);
+ all = ntsync_lock_obj(dev, obj);
list_del(&entry->node);
- spin_unlock(&obj->lock);
+ ntsync_unlock_obj(dev, obj, all);
+
+ put_obj(obj);
+ }
+
+ signaled = atomic_read(&q->signaled);
+ if (signaled != -1) {
+ struct ntsync_wait_args __user *user_args = argp;
+
+ /* even if we caught a signal, we need to communicate success */
+ ret = 0;
+
+ if (put_user(signaled, &user_args->index))
+ ret = -EFAULT;
+ } else if (!ret) {
+ ret = -ETIMEDOUT;
+ }
+
+ kfree(q);
+ return ret;
+}
+
+static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_wait_args args;
+ struct ntsync_q *q;
+ int signaled;
+ __u32 i;
+ int ret;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ ret = setup_wait(dev, &args, true, &q);
+ if (ret < 0)
+ return ret;
+
+ /* queue ourselves */
+
+ mutex_lock(&dev->wait_all_lock);
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ atomic_inc(&obj->all_hint);
+
+ /*
+ * obj->all_waiters is protected by dev->wait_all_lock rather
+ * than obj->lock, so there is no need to acquire obj->lock
+ * here.
+ */
+ list_add_tail(&entry->node, &obj->all_waiters);
+ }
+
+ /* check if we are already signaled */
+
+ try_wake_all(dev, q, NULL);
+
+ mutex_unlock(&dev->wait_all_lock);
+
+ /* sleep */
+
+ ret = ntsync_schedule(q, &args);
+
+ /* and finally, unqueue */
+
+ mutex_lock(&dev->wait_all_lock);
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ /*
+ * obj->all_waiters is protected by dev->wait_all_lock rather
+ * than obj->lock, so there is no need to acquire it here.
+ */
+ list_del(&entry->node);
+
+ atomic_dec(&obj->all_hint);
put_obj(obj);
}
+ mutex_unlock(&dev->wait_all_lock);
+
signaled = atomic_read(&q->signaled);
if (signaled != -1) {
struct ntsync_wait_args __user *user_args = argp;
@@ -442,6 +746,8 @@ static int ntsync_char_open(struct inode
if (!dev)
return -ENOMEM;
+ mutex_init(&dev->wait_all_lock);
+
file->private_data = dev;
dev->file = file;
return nonseekable_open(inode, file);
@@ -465,6 +771,8 @@ static long ntsync_char_ioctl(struct fil
switch (cmd) {
case NTSYNC_IOC_CREATE_SEM:
return ntsync_create_sem(dev, argp);
+ case NTSYNC_IOC_WAIT_ALL:
+ return ntsync_wait_all(dev, argp);
case NTSYNC_IOC_WAIT_ANY:
return ntsync_wait_any(dev, argp);
default:
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -31,6 +31,7 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args)
#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args)
+#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
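By contrast with wait-any, a sketch of acquiring two semaphores atomically through the new NTSYNC_IOC_WAIT_ALL (same assumptions as before: patched <linux/ntsync.h>, driver loaded):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        struct ntsync_sem_args a = { .count = 1, .max = 1 };
        struct ntsync_sem_args b = { .count = 1, .max = 1 };
        struct ntsync_wait_args wait = { 0 };
        int fds[2];
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        if (ioctl(dev, NTSYNC_IOC_CREATE_SEM, &a) < 0 ||
            ioctl(dev, NTSYNC_IOC_CREATE_SEM, &b) < 0) { perror("create sem"); return 1; }

        fds[0] = a.sem;
        fds[1] = b.sem;
        wait.timeout = UINT64_MAX;      /* wait forever */
        wait.objs = (uintptr_t)fds;
        wait.count = 2;

        /*
         * Returns only once both semaphores are signaled at the same time;
         * both counts are then decremented as a single atomic operation.
         */
        if (ioctl(dev, NTSYNC_IOC_WAIT_ALL, &wait) < 0) { perror("wait all"); return 1; }
        printf("acquired both semaphores atomically\n");
        return 0;
    }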

@@ -0,0 +1,226 @@
From fdeceab49078a80987c665ed837ee4f1b8a942a8 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:29 -0500
Subject: ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX.
This corresponds to the NT syscall NtCreateMutant().
An NT mutex is recursive, with a 32-bit recursion counter. When acquired via
NtWaitForMultipleObjects(), the recursion counter is incremented by one.
The OS records the thread which acquired it. However, in order to keep this
driver self-contained, the owning thread ID is managed by user-space, and passed
as a parameter to all relevant ioctls.
The initial owner and recursion count, if any, are specified when the mutex is
created.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 77 +++++++++++++++++++++++++++++++++++--
include/uapi/linux/ntsync.h | 10 ++++-
2 files changed, 83 insertions(+), 4 deletions(-)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -25,6 +25,7 @@
enum ntsync_type {
NTSYNC_TYPE_SEM,
+ NTSYNC_TYPE_MUTEX,
};
/*
@@ -55,6 +56,10 @@ struct ntsync_obj {
__u32 count;
__u32 max;
} sem;
+ struct {
+ __u32 count;
+ pid_t owner;
+ } mutex;
} u;
/*
@@ -92,6 +97,7 @@ struct ntsync_q_entry {
struct ntsync_q {
struct task_struct *task;
+ __u32 owner;
/*
* Protected via atomic_try_cmpxchg(). Only the thread that wins the
@@ -214,13 +220,17 @@ static void ntsync_unlock_obj(struct nts
((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \
(obj)->dev_locked))
-static bool is_signaled(struct ntsync_obj *obj)
+static bool is_signaled(struct ntsync_obj *obj, __u32 owner)
{
ntsync_assert_held(obj);
switch (obj->type) {
case NTSYNC_TYPE_SEM:
return !!obj->u.sem.count;
+ case NTSYNC_TYPE_MUTEX:
+ if (obj->u.mutex.owner && obj->u.mutex.owner != owner)
+ return false;
+ return obj->u.mutex.count < UINT_MAX;
}
WARN(1, "bad object type %#x\n", obj->type);
@@ -250,7 +260,7 @@ static void try_wake_all(struct ntsync_d
}
for (i = 0; i < count; i++) {
- if (!is_signaled(q->entries[i].obj)) {
+ if (!is_signaled(q->entries[i].obj, q->owner)) {
can_wake = false;
break;
}
@@ -264,6 +274,10 @@ static void try_wake_all(struct ntsync_d
case NTSYNC_TYPE_SEM:
obj->u.sem.count--;
break;
+ case NTSYNC_TYPE_MUTEX:
+ obj->u.mutex.count++;
+ obj->u.mutex.owner = q->owner;
+ break;
}
}
wake_up_process(q->task);
@@ -307,6 +321,30 @@ static void try_wake_any_sem(struct ntsy
}
}
+static void try_wake_any_mutex(struct ntsync_obj *mutex)
+{
+ struct ntsync_q_entry *entry;
+
+ ntsync_assert_held(mutex);
+ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX);
+
+ list_for_each_entry(entry, &mutex->any_waiters, node) {
+ struct ntsync_q *q = entry->q;
+ int signaled = -1;
+
+ if (mutex->u.mutex.count == UINT_MAX)
+ break;
+ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner)
+ continue;
+
+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+ mutex->u.mutex.count++;
+ mutex->u.mutex.owner = q->owner;
+ wake_up_process(q->task);
+ }
+ }
+}
+
/*
* Actually change the semaphore state, returning -EOVERFLOW if it is made
* invalid.
@@ -455,6 +493,33 @@ static int ntsync_create_sem(struct ntsy
return put_user(fd, &user_args->sem);
}
+static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_mutex_args __user *user_args = argp;
+ struct ntsync_mutex_args args;
+ struct ntsync_obj *mutex;
+ int fd;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ if (!args.owner != !args.count)
+ return -EINVAL;
+
+ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX);
+ if (!mutex)
+ return -ENOMEM;
+ mutex->u.mutex.count = args.count;
+ mutex->u.mutex.owner = args.owner;
+ fd = ntsync_obj_get_fd(mutex);
+ if (fd < 0) {
+ kfree(mutex);
+ return fd;
+ }
+
+ return put_user(fd, &user_args->mutex);
+}
+
static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
{
struct file *file = fget(fd);
@@ -524,7 +589,7 @@ static int setup_wait(struct ntsync_devi
struct ntsync_q *q;
__u32 i, j;
- if (args->pad[0] || args->pad[1] || args->pad[2] || (args->flags & ~NTSYNC_WAIT_REALTIME))
+ if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME))
return -EINVAL;
if (args->count > NTSYNC_MAX_WAIT_COUNT)
@@ -538,6 +603,7 @@ static int setup_wait(struct ntsync_devi
if (!q)
return -ENOMEM;
q->task = current;
+ q->owner = args->owner;
atomic_set(&q->signaled, -1);
q->all = all;
q->count = count;
@@ -580,6 +646,9 @@ static void try_wake_any_obj(struct ntsy
case NTSYNC_TYPE_SEM:
try_wake_any_sem(obj);
break;
+ case NTSYNC_TYPE_MUTEX:
+ try_wake_any_mutex(obj);
+ break;
}
}
@@ -769,6 +838,8 @@ static long ntsync_char_ioctl(struct fil
void __user *argp = (void __user *)parm;
switch (cmd) {
+ case NTSYNC_IOC_CREATE_MUTEX:
+ return ntsync_create_mutex(dev, argp);
case NTSYNC_IOC_CREATE_SEM:
return ntsync_create_sem(dev, argp);
case NTSYNC_IOC_WAIT_ALL:
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -16,6 +16,12 @@ struct ntsync_sem_args {
__u32 max;
};
+struct ntsync_mutex_args {
+ __u32 mutex;
+ __u32 owner;
+ __u32 count;
+};
+
#define NTSYNC_WAIT_REALTIME 0x1
struct ntsync_wait_args {
@@ -24,7 +30,8 @@ struct ntsync_wait_args {
__u32 count;
__u32 index;
__u32 flags;
- __u32 pad[3];
+ __u32 owner;
+ __u32 pad[2];
};
#define NTSYNC_MAX_WAIT_COUNT 64
@@ -32,6 +39,7 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args)
#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args)
#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args)
+#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
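A sketch of creating a mutex and acquiring it through a wait, using the owner identifier introduced by this patch (the identifier is opaque to the driver; getpid() below is just a convenient nonzero value):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        /* owner and count must be both zero (unowned) or both nonzero. */
        struct ntsync_mutex_args m = { .owner = 0, .count = 0 };
        struct ntsync_wait_args wait = { 0 };
        __u32 me = getpid();            /* opaque nonzero owner identifier */
        int fds[1];
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        if (ioctl(dev, NTSYNC_IOC_CREATE_MUTEX, &m) < 0) { perror("create mutex"); return 1; }

        fds[0] = m.mutex;               /* fd of the new mutex */
        wait.timeout = UINT64_MAX;
        wait.objs = (uintptr_t)fds;
        wait.count = 1;
        wait.owner = me;                /* recorded as the owner on acquisition */

        /* The mutex is unowned, so the wait succeeds: count becomes 1, owner becomes 'me'. */
        if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) < 0) { perror("wait any"); return 1; }
        printf("mutex acquired by owner %u\n", me);
        return 0;
    }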

@@ -0,0 +1,95 @@
From cc9ade623cd90cd002fb86f3aa249af2e6e4019e Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:30 -0500
Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK.
This corresponds to the NT syscall NtReleaseMutant().
This syscall decrements the mutex's recursion count by one, and returns the
previous value. If the mutex is not owned by the current task, the function
instead fails and returns -EPERM.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 53 +++++++++++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 54 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -396,6 +396,57 @@ static int ntsync_sem_post(struct ntsync
return ret;
}
+/*
+ * Actually change the mutex state, returning -EPERM if not the owner.
+ */
+static int unlock_mutex_state(struct ntsync_obj *mutex,
+ const struct ntsync_mutex_args *args)
+{
+ ntsync_assert_held(mutex);
+
+ if (mutex->u.mutex.owner != args->owner)
+ return -EPERM;
+
+ if (!--mutex->u.mutex.count)
+ mutex->u.mutex.owner = 0;
+ return 0;
+}
+
+static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp)
+{
+ struct ntsync_mutex_args __user *user_args = argp;
+ struct ntsync_device *dev = mutex->dev;
+ struct ntsync_mutex_args args;
+ __u32 prev_count;
+ bool all;
+ int ret;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (!args.owner)
+ return -EINVAL;
+
+ if (mutex->type != NTSYNC_TYPE_MUTEX)
+ return -EINVAL;
+
+ all = ntsync_lock_obj(dev, mutex);
+
+ prev_count = mutex->u.mutex.count;
+ ret = unlock_mutex_state(mutex, &args);
+ if (!ret) {
+ if (all)
+ try_wake_all_obj(dev, mutex);
+ try_wake_any_mutex(mutex);
+ }
+
+ ntsync_unlock_obj(dev, mutex, all);
+
+ if (!ret && put_user(prev_count, &user_args->count))
+ ret = -EFAULT;
+
+ return ret;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -415,6 +466,8 @@ static long ntsync_obj_ioctl(struct file
switch (cmd) {
case NTSYNC_IOC_SEM_POST:
return ntsync_sem_post(obj, argp);
+ case NTSYNC_IOC_MUTEX_UNLOCK:
+ return ntsync_mutex_unlock(obj, argp);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -42,5 +42,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
+#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args)
#endif
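Building on the mutex sketch above, releasing an acquired mutex is an ioctl on the mutex fd rather than on the device fd; a hypothetical helper might look like this:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Release 'mutex_fd' on behalf of 'owner'; returns the previous recursion count, or -1. */
    int release_mutex(int mutex_fd, __u32 owner)
    {
        struct ntsync_mutex_args args = { .owner = owner };

        /* Fails with EPERM if 'owner' does not currently own the mutex. */
        if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_UNLOCK, &args) < 0) {
            perror("mutex unlock");
            return -1;
        }
        return (int)args.count;         /* recursion count before this release */
    }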

@@ -0,0 +1,156 @@
From dca3fe766afa42e34f5d3f62c0f2850760663176 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:31 -0500
Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_KILL.
This does not correspond to any NT syscall. Rather, when a thread dies, it
should be called by the NT emulator for each mutex, with the TID of the dying
thread.
NT mutexes are robust (in the pthread sense). When an NT thread dies, any
mutexes it owned are immediately released. Acquisition of those mutexes by other
threads will return a special value indicating that the mutex was abandoned,
like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed
used here for that purpose.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 61 +++++++++++++++++++++++++++++++++++--
include/uapi/linux/ntsync.h | 1 +
2 files changed, 60 insertions(+), 2 deletions(-)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -59,6 +59,7 @@ struct ntsync_obj {
struct {
__u32 count;
pid_t owner;
+ bool ownerdead;
} mutex;
} u;
@@ -107,6 +108,7 @@ struct ntsync_q {
atomic_t signaled;
bool all;
+ bool ownerdead;
__u32 count;
struct ntsync_q_entry entries[];
};
@@ -275,6 +277,9 @@ static void try_wake_all(struct ntsync_d
obj->u.sem.count--;
break;
case NTSYNC_TYPE_MUTEX:
+ if (obj->u.mutex.ownerdead)
+ q->ownerdead = true;
+ obj->u.mutex.ownerdead = false;
obj->u.mutex.count++;
obj->u.mutex.owner = q->owner;
break;
@@ -338,6 +343,9 @@ static void try_wake_any_mutex(struct nt
continue;
if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+ if (mutex->u.mutex.ownerdead)
+ q->ownerdead = true;
+ mutex->u.mutex.ownerdead = false;
mutex->u.mutex.count++;
mutex->u.mutex.owner = q->owner;
wake_up_process(q->task);
@@ -447,6 +455,52 @@ static int ntsync_mutex_unlock(struct nt
return ret;
}
+/*
+ * Actually change the mutex state to mark its owner as dead,
+ * returning -EPERM if not the owner.
+ */
+static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner)
+{
+ ntsync_assert_held(mutex);
+
+ if (mutex->u.mutex.owner != owner)
+ return -EPERM;
+
+ mutex->u.mutex.ownerdead = true;
+ mutex->u.mutex.owner = 0;
+ mutex->u.mutex.count = 0;
+ return 0;
+}
+
+static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp)
+{
+ struct ntsync_device *dev = mutex->dev;
+ __u32 owner;
+ bool all;
+ int ret;
+
+ if (get_user(owner, (__u32 __user *)argp))
+ return -EFAULT;
+ if (!owner)
+ return -EINVAL;
+
+ if (mutex->type != NTSYNC_TYPE_MUTEX)
+ return -EINVAL;
+
+ all = ntsync_lock_obj(dev, mutex);
+
+ ret = kill_mutex_state(mutex, owner);
+ if (!ret) {
+ if (all)
+ try_wake_all_obj(dev, mutex);
+ try_wake_any_mutex(mutex);
+ }
+
+ ntsync_unlock_obj(dev, mutex, all);
+
+ return ret;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -468,6 +522,8 @@ static long ntsync_obj_ioctl(struct file
return ntsync_sem_post(obj, argp);
case NTSYNC_IOC_MUTEX_UNLOCK:
return ntsync_mutex_unlock(obj, argp);
+ case NTSYNC_IOC_MUTEX_KILL:
+ return ntsync_mutex_kill(obj, argp);
default:
return -ENOIOCTLCMD;
}
@@ -659,6 +715,7 @@ static int setup_wait(struct ntsync_devi
q->owner = args->owner;
atomic_set(&q->signaled, -1);
q->all = all;
+ q->ownerdead = false;
q->count = count;
for (i = 0; i < count; i++) {
@@ -767,7 +824,7 @@ static int ntsync_wait_any(struct ntsync
struct ntsync_wait_args __user *user_args = argp;
/* even if we caught a signal, we need to communicate success */
- ret = 0;
+ ret = q->ownerdead ? -EOWNERDEAD : 0;
if (put_user(signaled, &user_args->index))
ret = -EFAULT;
@@ -848,7 +905,7 @@ static int ntsync_wait_all(struct ntsync
struct ntsync_wait_args __user *user_args = argp;
/* even if we caught a signal, we need to communicate success */
- ret = 0;
+ ret = q->ownerdead ? -EOWNERDEAD : 0;
if (put_user(signaled, &user_args->index))
ret = -EFAULT;
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -43,5 +43,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args)
+#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32)
#endif
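A sketch of the abandonment flow: the emulator marks a dead owner's mutexes with NTSYNC_IOC_MUTEX_KILL, and the next acquisition succeeds but reports EOWNERDEAD (owners 1 and 2 below are arbitrary stand-ins for thread identifiers):

    #include <errno.h>
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        /* Create the mutex already owned (recursion count 1) by "thread" 1. */
        struct ntsync_mutex_args m = { .owner = 1, .count = 1 };
        struct ntsync_wait_args wait = { 0 };
        __u32 dead = 1;
        int fds[1];
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        if (ioctl(dev, NTSYNC_IOC_CREATE_MUTEX, &m) < 0) { perror("create mutex"); return 1; }

        /* "Thread" 1 died: mark its mutex as abandoned. */
        if (ioctl(m.mutex, NTSYNC_IOC_MUTEX_KILL, &dead) < 0) { perror("mutex kill"); return 1; }

        fds[0] = m.mutex;
        wait.timeout = UINT64_MAX;
        wait.objs = (uintptr_t)fds;
        wait.count = 1;
        wait.owner = 2;                 /* a different thread acquires the abandoned mutex */

        /* The wait acquires the mutex and writes the index, but fails with EOWNERDEAD. */
        if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) < 0 && errno == EOWNERDEAD)
            printf("acquired abandoned mutex (index %u)\n", wait.index);
        return 0;
    }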

@@ -0,0 +1,166 @@
From 3f3bbc85f1e613364261d685b8197c32ffdeaad0 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:32 -0500
Subject: ntsync: Introduce NTSYNC_IOC_CREATE_EVENT.
This corresponds to the NT syscall NtCreateEvent().
An NT event holds a single bit of state denoting whether it is signaled or
unsignaled.
There are two types of events: manual-reset and automatic-reset. When an
automatic-reset event is acquired via a wait function, its state is reset to
unsignaled. Manual-reset events are not affected by wait functions.
Whether the event is manual-reset, and its initial state, are specified at
creation time.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 62 +++++++++++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 7 +++++
2 files changed, 69 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -26,6 +26,7 @@
enum ntsync_type {
NTSYNC_TYPE_SEM,
NTSYNC_TYPE_MUTEX,
+ NTSYNC_TYPE_EVENT,
};
/*
@@ -61,6 +62,10 @@ struct ntsync_obj {
pid_t owner;
bool ownerdead;
} mutex;
+ struct {
+ bool manual;
+ bool signaled;
+ } event;
} u;
/*
@@ -233,6 +238,8 @@ static bool is_signaled(struct ntsync_ob
if (obj->u.mutex.owner && obj->u.mutex.owner != owner)
return false;
return obj->u.mutex.count < UINT_MAX;
+ case NTSYNC_TYPE_EVENT:
+ return obj->u.event.signaled;
}
WARN(1, "bad object type %#x\n", obj->type);
@@ -283,6 +290,10 @@ static void try_wake_all(struct ntsync_d
obj->u.mutex.count++;
obj->u.mutex.owner = q->owner;
break;
+ case NTSYNC_TYPE_EVENT:
+ if (!obj->u.event.manual)
+ obj->u.event.signaled = false;
+ break;
}
}
wake_up_process(q->task);
@@ -353,6 +364,28 @@ static void try_wake_any_mutex(struct nt
}
}
+static void try_wake_any_event(struct ntsync_obj *event)
+{
+ struct ntsync_q_entry *entry;
+
+ ntsync_assert_held(event);
+ lockdep_assert(event->type == NTSYNC_TYPE_EVENT);
+
+ list_for_each_entry(entry, &event->any_waiters, node) {
+ struct ntsync_q *q = entry->q;
+ int signaled = -1;
+
+ if (!event->u.event.signaled)
+ break;
+
+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+ if (!event->u.event.manual)
+ event->u.event.signaled = false;
+ wake_up_process(q->task);
+ }
+ }
+}
+
/*
* Actually change the semaphore state, returning -EOVERFLOW if it is made
* invalid.
@@ -629,6 +662,30 @@ static int ntsync_create_mutex(struct nt
return put_user(fd, &user_args->mutex);
}
+static int ntsync_create_event(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_event_args __user *user_args = argp;
+ struct ntsync_event_args args;
+ struct ntsync_obj *event;
+ int fd;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT);
+ if (!event)
+ return -ENOMEM;
+ event->u.event.manual = args.manual;
+ event->u.event.signaled = args.signaled;
+ fd = ntsync_obj_get_fd(event);
+ if (fd < 0) {
+ kfree(event);
+ return fd;
+ }
+
+ return put_user(fd, &user_args->event);
+}
+
static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
{
struct file *file = fget(fd);
@@ -759,6 +816,9 @@ static void try_wake_any_obj(struct ntsy
case NTSYNC_TYPE_MUTEX:
try_wake_any_mutex(obj);
break;
+ case NTSYNC_TYPE_EVENT:
+ try_wake_any_event(obj);
+ break;
}
}
@@ -948,6 +1008,8 @@ static long ntsync_char_ioctl(struct fil
void __user *argp = (void __user *)parm;
switch (cmd) {
+ case NTSYNC_IOC_CREATE_EVENT:
+ return ntsync_create_event(dev, argp);
case NTSYNC_IOC_CREATE_MUTEX:
return ntsync_create_mutex(dev, argp);
case NTSYNC_IOC_CREATE_SEM:
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -22,6 +22,12 @@ struct ntsync_mutex_args {
__u32 count;
};
+struct ntsync_event_args {
+ __u32 event;
+ __u32 manual;
+ __u32 signaled;
+};
+
#define NTSYNC_WAIT_REALTIME 0x1
struct ntsync_wait_args {
@@ -40,6 +46,7 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args)
#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args)
#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args)
+#define NTSYNC_IOC_CREATE_EVENT _IOWR('N', 0x87, struct ntsync_event_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args)
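A sketch of creating and waiting on an auto-reset event; it starts out signaled here, so the wait returns immediately and designals it:

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        struct ntsync_event_args ev = { .manual = 0, .signaled = 1 };   /* auto-reset, initially signaled */
        struct ntsync_wait_args wait = { 0 };
        int fds[1];
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        if (ioctl(dev, NTSYNC_IOC_CREATE_EVENT, &ev) < 0) { perror("create event"); return 1; }

        fds[0] = ev.event;              /* fd of the new event */
        wait.timeout = UINT64_MAX;
        wait.objs = (uintptr_t)fds;
        wait.count = 1;

        /* The wait is satisfied and, because the event is auto-reset, leaves it unsignaled. */
        if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) < 0) { perror("wait any"); return 1; }
        printf("event consumed (index %u)\n", wait.index);
        return 0;
    }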

@@ -0,0 +1,67 @@
From a6f107f17a976008b85c3e269bf4196e595d3f52 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:33 -0500
Subject: ntsync: Introduce NTSYNC_IOC_EVENT_SET.
This corresponds to the NT syscall NtSetEvent().
This sets the event to the signaled state, and returns its previous state.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 27 +++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 28 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -534,6 +534,31 @@ static int ntsync_mutex_kill(struct ntsy
return ret;
}
+static int ntsync_event_set(struct ntsync_obj *event, void __user *argp)
+{
+ struct ntsync_device *dev = event->dev;
+ __u32 prev_state;
+ bool all;
+
+ if (event->type != NTSYNC_TYPE_EVENT)
+ return -EINVAL;
+
+ all = ntsync_lock_obj(dev, event);
+
+ prev_state = event->u.event.signaled;
+ event->u.event.signaled = true;
+ if (all)
+ try_wake_all_obj(dev, event);
+ try_wake_any_event(event);
+
+ ntsync_unlock_obj(dev, event, all);
+
+ if (put_user(prev_state, (__u32 __user *)argp))
+ return -EFAULT;
+
+ return 0;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -557,6 +582,8 @@ static long ntsync_obj_ioctl(struct file
return ntsync_mutex_unlock(obj, argp);
case NTSYNC_IOC_MUTEX_KILL:
return ntsync_mutex_kill(obj, argp);
+ case NTSYNC_IOC_EVENT_SET:
+ return ntsync_event_set(obj, argp);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -51,5 +51,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args)
#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32)
+#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32)
#endif
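Setting an event is a single ioctl on the event fd, with the previous state returned through the same 32-bit value; a hypothetical helper:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Signal 'event_fd'; returns the previous state (0 or 1), or -1 on error. */
    int set_event(int event_fd)
    {
        __u32 prev;

        if (ioctl(event_fd, NTSYNC_IOC_EVENT_SET, &prev) < 0) {
            perror("event set");
            return -1;
        }
        return (int)prev;               /* 1 if the event was already signaled */
    }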

@@ -0,0 +1,64 @@
From aa3ebb5870eb9ed259aba2ed4e07e9993e6cd978 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:34 -0500
Subject: ntsync: Introduce NTSYNC_IOC_EVENT_RESET.
This corresponds to the NT syscall NtResetEvent().
This sets the event to the unsignaled state, and returns its previous state.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 24 ++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 25 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -559,6 +559,28 @@ static int ntsync_event_set(struct ntsyn
return 0;
}
+static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp)
+{
+ struct ntsync_device *dev = event->dev;
+ __u32 prev_state;
+ bool all;
+
+ if (event->type != NTSYNC_TYPE_EVENT)
+ return -EINVAL;
+
+ all = ntsync_lock_obj(dev, event);
+
+ prev_state = event->u.event.signaled;
+ event->u.event.signaled = false;
+
+ ntsync_unlock_obj(dev, event, all);
+
+ if (put_user(prev_state, (__u32 __user *)argp))
+ return -EFAULT;
+
+ return 0;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -584,6 +606,8 @@ static long ntsync_obj_ioctl(struct file
return ntsync_mutex_kill(obj, argp);
case NTSYNC_IOC_EVENT_SET:
return ntsync_event_set(obj, argp);
+ case NTSYNC_IOC_EVENT_RESET:
+ return ntsync_event_reset(obj, argp);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -52,5 +52,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args)
#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32)
#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32)
+#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32)
#endif

@@ -0,0 +1,60 @@
From 99bca5d776a3011214041c42107a210fe315a35e Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:35 -0500
Subject: ntsync: Introduce NTSYNC_IOC_EVENT_PULSE.
This corresponds to the NT syscall NtPulseEvent().
This wakes up any waiters as if the event had been set, but does not set the
event, instead resetting it if it had been signalled. Thus, for a manual-reset
event, all waiters are woken, whereas for an auto-reset event, at most one
waiter is woken.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 8 ++++++--
include/uapi/linux/ntsync.h | 1 +
2 files changed, 7 insertions(+), 2 deletions(-)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -534,7 +534,7 @@ static int ntsync_mutex_kill(struct ntsy
return ret;
}
-static int ntsync_event_set(struct ntsync_obj *event, void __user *argp)
+static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse)
{
struct ntsync_device *dev = event->dev;
__u32 prev_state;
@@ -550,6 +550,8 @@ static int ntsync_event_set(struct ntsyn
if (all)
try_wake_all_obj(dev, event);
try_wake_any_event(event);
+ if (pulse)
+ event->u.event.signaled = false;
ntsync_unlock_obj(dev, event, all);
@@ -605,9 +607,11 @@ static long ntsync_obj_ioctl(struct file
case NTSYNC_IOC_MUTEX_KILL:
return ntsync_mutex_kill(obj, argp);
case NTSYNC_IOC_EVENT_SET:
- return ntsync_event_set(obj, argp);
+ return ntsync_event_set(obj, argp, false);
case NTSYNC_IOC_EVENT_RESET:
return ntsync_event_reset(obj, argp);
+ case NTSYNC_IOC_EVENT_PULSE:
+ return ntsync_event_set(obj, argp, true);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -53,5 +53,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32)
#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32)
#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32)
+#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32)
#endif
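A sketch contrasting pulse with set: both wake eligible waiters, but a pulse always leaves the event unsignaled, so a waiter arriving afterwards sees nothing:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        struct ntsync_event_args ev = { .manual = 1, .signaled = 0 };   /* manual-reset, unsignaled */
        __u32 prev;
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        if (ioctl(dev, NTSYNC_IOC_CREATE_EVENT, &ev) < 0) { perror("create event"); return 1; }

        /* No waiters are queued, so the pulse only reports the previous state... */
        if (ioctl(ev.event, NTSYNC_IOC_EVENT_PULSE, &prev) < 0) { perror("event pulse"); return 1; }
        printf("state before pulse: %u\n", prev);

        /* ...and the event is still unsignaled afterwards, unlike after NTSYNC_IOC_EVENT_SET. */
        if (ioctl(ev.event, NTSYNC_IOC_EVENT_SET, &prev) < 0) { perror("event set"); return 1; }
        printf("state before set: %u\n", prev);     /* prints 0: the pulse did not leave it set */
        return 0;
    }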

@@ -0,0 +1,66 @@
From 1ef0ea672662bd19e7c6a4eac1067d11e50844b2 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:36 -0500
Subject: ntsync: Introduce NTSYNC_IOC_SEM_READ.
This corresponds to the NT syscall NtQuerySemaphore().
This returns the current count and maximum count of the semaphore.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 27 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -583,6 +583,30 @@ static int ntsync_event_reset(struct nts
return 0;
}
+static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp)
+{
+ struct ntsync_sem_args __user *user_args = argp;
+ struct ntsync_device *dev = sem->dev;
+ struct ntsync_sem_args args;
+ bool all;
+
+ if (sem->type != NTSYNC_TYPE_SEM)
+ return -EINVAL;
+
+ args.sem = 0;
+
+ all = ntsync_lock_obj(dev, sem);
+
+ args.count = sem->u.sem.count;
+ args.max = sem->u.sem.max;
+
+ ntsync_unlock_obj(dev, sem, all);
+
+ if (copy_to_user(user_args, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -602,6 +626,8 @@ static long ntsync_obj_ioctl(struct file
switch (cmd) {
case NTSYNC_IOC_SEM_POST:
return ntsync_sem_post(obj, argp);
+ case NTSYNC_IOC_SEM_READ:
+ return ntsync_sem_read(obj, argp);
case NTSYNC_IOC_MUTEX_UNLOCK:
return ntsync_mutex_unlock(obj, argp);
case NTSYNC_IOC_MUTEX_KILL:
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -54,5 +54,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32)
#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32)
#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32)
+#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args)
#endif
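Reading semaphore state is a query on the semaphore fd; only count and max carry information on output. A hypothetical helper:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Print the current and maximum count of the semaphore behind 'sem_fd'. */
    int dump_sem(int sem_fd)
    {
        struct ntsync_sem_args state = { 0 };

        if (ioctl(sem_fd, NTSYNC_IOC_SEM_READ, &state) < 0) {
            perror("sem read");
            return -1;
        }
        printf("count %u / max %u\n", state.count, state.max);
        return 0;
    }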

@@ -0,0 +1,68 @@
From 7891b7d15abd12975aebb955821fbc43353b45d6 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:37 -0500
Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_READ.
This corresponds to the NT syscall NtQueryMutant().
This returns the recursion count, owner, and abandoned state of the mutex.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 28 ++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 29 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -607,6 +607,32 @@ static int ntsync_sem_read(struct ntsync
return 0;
}
+static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp)
+{
+ struct ntsync_mutex_args __user *user_args = argp;
+ struct ntsync_device *dev = mutex->dev;
+ struct ntsync_mutex_args args;
+ bool all;
+ int ret;
+
+ if (mutex->type != NTSYNC_TYPE_MUTEX)
+ return -EINVAL;
+
+ args.mutex = 0;
+
+ all = ntsync_lock_obj(dev, mutex);
+
+ args.count = mutex->u.mutex.count;
+ args.owner = mutex->u.mutex.owner;
+ ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0;
+
+ ntsync_unlock_obj(dev, mutex, all);
+
+ if (copy_to_user(user_args, &args, sizeof(args)))
+ return -EFAULT;
+ return ret;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -632,6 +658,8 @@ static long ntsync_obj_ioctl(struct file
return ntsync_mutex_unlock(obj, argp);
case NTSYNC_IOC_MUTEX_KILL:
return ntsync_mutex_kill(obj, argp);
+ case NTSYNC_IOC_MUTEX_READ:
+ return ntsync_mutex_read(obj, argp);
case NTSYNC_IOC_EVENT_SET:
return ntsync_event_set(obj, argp, false);
case NTSYNC_IOC_EVENT_RESET:
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -55,5 +55,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32)
#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32)
#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args)
+#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args)
#endif
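Querying a mutex works the same way, with the twist that an abandoned mutex is reported through the ioctl's return value while the fields are still filled in. A hypothetical helper:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Print owner and recursion count; returns 1 if the mutex was abandoned, -1 on error. */
    int dump_mutex(int mutex_fd)
    {
        struct ntsync_mutex_args state = { 0 };
        int abandoned = 0;

        if (ioctl(mutex_fd, NTSYNC_IOC_MUTEX_READ, &state) < 0) {
            if (errno != EOWNERDEAD) {
                perror("mutex read");
                return -1;
            }
            abandoned = 1;              /* state.owner and state.count are still valid */
        }
        printf("owner %u, count %u%s\n", state.owner, state.count,
               abandoned ? " (abandoned)" : "");
        return abandoned;
    }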

@@ -0,0 +1,66 @@
From 35ff252f99aa4002e0c2ecef37314a422969791b Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:38 -0500
Subject: ntsync: Introduce NTSYNC_IOC_EVENT_READ.
This corresponds to the NT syscall NtQueryEvent().
This returns the signaled state of the event and whether it is manual-reset.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 26 ++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 1 +
2 files changed, 27 insertions(+)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -633,6 +633,30 @@ static int ntsync_mutex_read(struct ntsy
return ret;
}
+static int ntsync_event_read(struct ntsync_obj *event, void __user *argp)
+{
+ struct ntsync_event_args __user *user_args = argp;
+ struct ntsync_device *dev = event->dev;
+ struct ntsync_event_args args;
+ bool all;
+
+ if (event->type != NTSYNC_TYPE_EVENT)
+ return -EINVAL;
+
+ args.event = 0;
+
+ all = ntsync_lock_obj(dev, event);
+
+ args.manual = event->u.event.manual;
+ args.signaled = event->u.event.signaled;
+
+ ntsync_unlock_obj(dev, event, all);
+
+ if (copy_to_user(user_args, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+}
+
static int ntsync_obj_release(struct inode *inode, struct file *file)
{
struct ntsync_obj *obj = file->private_data;
@@ -666,6 +690,8 @@ static long ntsync_obj_ioctl(struct file
return ntsync_event_reset(obj, argp);
case NTSYNC_IOC_EVENT_PULSE:
return ntsync_event_set(obj, argp, true);
+ case NTSYNC_IOC_EVENT_READ:
+ return ntsync_event_read(obj, argp);
default:
return -ENOIOCTLCMD;
}
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -56,5 +56,6 @@ struct ntsync_wait_args {
#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32)
#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args)
#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args)
+#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args)
#endif
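And the equivalent query for events, as a hypothetical helper:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    /* Print whether the event is manual-reset and whether it is currently signaled. */
    int dump_event(int event_fd)
    {
        struct ntsync_event_args state = { 0 };

        if (ioctl(event_fd, NTSYNC_IOC_EVENT_READ, &state) < 0) {
            perror("event read");
            return -1;
        }
        printf("%s-reset event, %ssignaled\n",
               state.manual ? "manual" : "auto",
               state.signaled ? "" : "not ");
        return 0;
    }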

@@ -0,0 +1,187 @@
From 2c391d57d1393cd46bf8bab08232ddc3dd32d5e5 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:39 -0500
Subject: ntsync: Introduce alertable waits.
NT waits can optionally be made "alertable". This is a special channel for
thread wakeup that is mildly similar to SIGIO. A thread has an internal single
bit of "alerted" state, and if a thread is alerted while an alertable wait, the
wait will return a special value, consume the "alerted" state, and will not
consume any of its objects.
Alerts are implemented using events; the user-space NT emulator is expected to
create an internal ntsync event for each thread and pass that event to wait
functions.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 70 ++++++++++++++++++++++++++++++++-----
include/uapi/linux/ntsync.h | 3 +-
2 files changed, 63 insertions(+), 10 deletions(-)
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -885,22 +885,29 @@ static int setup_wait(struct ntsync_devi
const struct ntsync_wait_args *args, bool all,
struct ntsync_q **ret_q)
{
+ int fds[NTSYNC_MAX_WAIT_COUNT + 1];
const __u32 count = args->count;
- int fds[NTSYNC_MAX_WAIT_COUNT];
struct ntsync_q *q;
+ __u32 total_count;
__u32 i, j;
- if (args->pad[0] || args->pad[1] || (args->flags & ~NTSYNC_WAIT_REALTIME))
+ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME))
return -EINVAL;
if (args->count > NTSYNC_MAX_WAIT_COUNT)
return -EINVAL;
+ total_count = count;
+ if (args->alert)
+ total_count++;
+
if (copy_from_user(fds, u64_to_user_ptr(args->objs),
array_size(count, sizeof(*fds))))
return -EFAULT;
+ if (args->alert)
+ fds[count] = args->alert;
- q = kmalloc(struct_size(q, entries, count), GFP_KERNEL);
+ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL);
if (!q)
return -ENOMEM;
q->task = current;
@@ -910,7 +917,7 @@ static int setup_wait(struct ntsync_devi
q->ownerdead = false;
q->count = count;
- for (i = 0; i < count; i++) {
+ for (i = 0; i < total_count; i++) {
struct ntsync_q_entry *entry = &q->entries[i];
struct ntsync_obj *obj = get_obj(dev, fds[i]);
@@ -960,10 +967,10 @@ static void try_wake_any_obj(struct ntsy
static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp)
{
struct ntsync_wait_args args;
+ __u32 i, total_count;
struct ntsync_q *q;
int signaled;
bool all;
- __u32 i;
int ret;
if (copy_from_user(&args, argp, sizeof(args)))
@@ -973,9 +980,13 @@ static int ntsync_wait_any(struct ntsync
if (ret < 0)
return ret;
+ total_count = args.count;
+ if (args.alert)
+ total_count++;
+
/* queue ourselves */
- for (i = 0; i < args.count; i++) {
+ for (i = 0; i < total_count; i++) {
struct ntsync_q_entry *entry = &q->entries[i];
struct ntsync_obj *obj = entry->obj;
@@ -984,9 +995,15 @@ static int ntsync_wait_any(struct ntsync
ntsync_unlock_obj(dev, obj, all);
}
- /* check if we are already signaled */
+ /*
+ * Check if we are already signaled.
+ *
+ * Note that the API requires that normal objects are checked before
+ * the alert event. Hence we queue the alert event last, and check
+ * objects in order.
+ */
- for (i = 0; i < args.count; i++) {
+ for (i = 0; i < total_count; i++) {
struct ntsync_obj *obj = q->entries[i].obj;
if (atomic_read(&q->signaled) != -1)
@@ -1003,7 +1020,7 @@ static int ntsync_wait_any(struct ntsync
/* and finally, unqueue */
- for (i = 0; i < args.count; i++) {
+ for (i = 0; i < total_count; i++) {
struct ntsync_q_entry *entry = &q->entries[i];
struct ntsync_obj *obj = entry->obj;
@@ -1063,6 +1080,14 @@ static int ntsync_wait_all(struct ntsync
*/
list_add_tail(&entry->node, &obj->all_waiters);
}
+ if (args.alert) {
+ struct ntsync_q_entry *entry = &q->entries[args.count];
+ struct ntsync_obj *obj = entry->obj;
+
+ dev_lock_obj(dev, obj);
+ list_add_tail(&entry->node, &obj->any_waiters);
+ dev_unlock_obj(dev, obj);
+ }
/* check if we are already signaled */
@@ -1070,6 +1095,21 @@ static int ntsync_wait_all(struct ntsync
mutex_unlock(&dev->wait_all_lock);
+ /*
+ * Check if the alert event is signaled, making sure to do so only
+ * after checking if the other objects are signaled.
+ */
+
+ if (args.alert) {
+ struct ntsync_obj *obj = q->entries[args.count].obj;
+
+ if (atomic_read(&q->signaled) == -1) {
+ bool all = ntsync_lock_obj(dev, obj);
+ try_wake_any_obj(obj);
+ ntsync_unlock_obj(dev, obj, all);
+ }
+ }
+
/* sleep */
ret = ntsync_schedule(q, &args);
@@ -1095,6 +1135,18 @@ static int ntsync_wait_all(struct ntsync
mutex_unlock(&dev->wait_all_lock);
+ if (args.alert) {
+ struct ntsync_q_entry *entry = &q->entries[args.count];
+ struct ntsync_obj *obj = entry->obj;
+ bool all;
+
+ all = ntsync_lock_obj(dev, obj);
+ list_del(&entry->node);
+ ntsync_unlock_obj(dev, obj, all);
+
+ put_obj(obj);
+ }
+
signaled = atomic_read(&q->signaled);
if (signaled != -1) {
struct ntsync_wait_args __user *user_args = argp;
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -37,7 +37,8 @@ struct ntsync_wait_args {
__u32 index;
__u32 flags;
__u32 owner;
- __u32 pad[2];
+ __u32 alert;
+ __u32 pad;
};
#define NTSYNC_MAX_WAIT_COUNT 64
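A sketch of an alertable wait: the alert fd rides alongside the object array, and a returned index equal to count means the wake came from the alert event rather than from one of the objects (error checking trimmed for brevity):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ntsync.h>

    int main(void)
    {
        struct ntsync_sem_args sem = { .count = 0, .max = 1 };            /* stays unsignaled */
        struct ntsync_event_args alert = { .manual = 0, .signaled = 0 };  /* per-thread auto-reset event */
        struct ntsync_wait_args wait = { 0 };
        __u32 prev;
        int fds[1];
        int dev = open("/dev/ntsync", O_CLOEXEC | O_RDWR);

        if (dev < 0) { perror("open /dev/ntsync"); return 1; }
        ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem);
        ioctl(dev, NTSYNC_IOC_CREATE_EVENT, &alert);

        /* Alert the "thread" up front so the wait below returns immediately. */
        ioctl(alert.event, NTSYNC_IOC_EVENT_SET, &prev);

        fds[0] = sem.sem;
        wait.timeout = UINT64_MAX;
        wait.objs = (uintptr_t)fds;
        wait.count = 1;
        wait.owner = 1;                 /* arbitrary nonzero identifier */
        wait.alert = alert.event;

        if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) == 0) {
            if (wait.index == wait.count)
                printf("woken by the alert; no object was consumed\n");
            else
                printf("acquired object %u\n", wait.index);
        }
        return 0;
    }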

@@ -0,0 +1,30 @@
From 8d87043cd76368bb9996ba541d12e40cbb4201e5 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:52 -0500
Subject: maintainers: Add an entry for ntsync.
Add myself as maintainer, supported by CodeWeavers.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
MAINTAINERS | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Softwa
F: Documentation/filesystems/ntfs3.rst
F: fs/ntfs3/
+NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER
+M: Elizabeth Figura <zfigura@codeweavers.com>
+L: wine-devel@winehq.org
+S: Supported
+F: Documentation/userspace-api/ntsync.rst
+F: drivers/misc/ntsync.c
+F: include/uapi/linux/ntsync.h
+F: tools/testing/selftests/drivers/ntsync/
+
NUBUS SUBSYSTEM
M: Finn Thain <fthain@linux-m68k.org>
L: linux-m68k@lists.linux-m68k.org

@@ -0,0 +1,426 @@
From 4cb25d42d38f1e0b144b084674591b70afa60bb0 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Sun, 19 May 2024 15:24:53 -0500
Subject: docs: ntsync: Add documentation for the ntsync uAPI.
Add an overall explanation of the driver architecture, and complete and precise
specification for its intended behaviour.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
Documentation/userspace-api/index.rst | 1 +
Documentation/userspace-api/ntsync.rst | 398 +++++++++++++++++++++++++
2 files changed, 399 insertions(+)
create mode 100644 Documentation/userspace-api/ntsync.rst
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -63,6 +63,7 @@ Everything else
vduse
futex2
perf_ring_buffer
+ ntsync
.. only:: subproject and html
--- /dev/null
+++ b/Documentation/userspace-api/ntsync.rst
@@ -0,0 +1,398 @@
+===================================
+NT synchronization primitive driver
+===================================
+
+This page documents the user-space API for the ntsync driver.
+
+ntsync is a support driver for emulation of NT synchronization
+primitives by user-space NT emulators. It exists because implementation
+in user-space, using existing tools, cannot match Windows performance
+while offering accurate semantics. It is implemented entirely in
+software, and does not drive any hardware device.
+
+This interface is meant as a compatibility tool only, and should not
+be used for general synchronization. Instead use generic, versatile
+interfaces such as futex(2) and poll(2).
+
+Synchronization primitives
+==========================
+
+The ntsync driver exposes three types of synchronization primitives:
+semaphores, mutexes, and events.
+
+A semaphore holds a single volatile 32-bit counter, and a static 32-bit
+integer denoting the maximum value. It is considered signaled (that is,
+can be acquired without contention, or will wake up a waiting thread)
+when the counter is nonzero. The counter is decremented by one when a
+wait is satisfied. Both the initial and maximum count are established
+when the semaphore is created.
+
+A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit
+identifier denoting its owner. A mutex is considered signaled when its
+owner is zero (indicating that it is not owned). The recursion count is
+incremented when a wait is satisfied, and ownership is set to the given
+identifier.
+
+A mutex also holds an internal flag denoting whether its previous owner
+has died; such a mutex is said to be abandoned. Owner death is not
+tracked automatically based on thread death, but rather must be
+communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is
+inherently considered unowned.
+
+Except for the "unowned" semantics of zero, the actual value of the
+owner identifier is not interpreted by the ntsync driver at all. The
+intended use is to store a thread identifier; however, the ntsync
+driver does not actually validate that a calling thread provides
+consistent or unique identifiers.
+
+An event is similar to a semaphore with a maximum count of one. It holds
+a volatile boolean state denoting whether it is signaled or not. There
+are two types of events, auto-reset and manual-reset. An auto-reset
+event is designaled when a wait is satisfied; a manual-reset event is
+not. The event type is specified when the event is created.
+
+Unless specified otherwise, all operations on an object are atomic and
+totally ordered with respect to other operations on the same object.
+
+Objects are represented by files. When all file descriptors to an
+object are closed, that object is deleted.
+
+Char device
+===========
+
+The ntsync driver creates a single char device /dev/ntsync. Each file
+description opened on the device represents a unique instance intended
+to back an individual NT virtual machine. Objects created by one ntsync
+instance may only be used with other objects created by the same
+instance.
+
+ioctl reference
+===============
+
+All operations on the device are done through ioctls. There are four
+structures used in ioctl calls::
+
+ struct ntsync_sem_args {
+ __u32 sem;
+ __u32 count;
+ __u32 max;
+ };
+
+ struct ntsync_mutex_args {
+ __u32 mutex;
+ __u32 owner;
+ __u32 count;
+ };
+
+ struct ntsync_event_args {
+ __u32 event;
+ __u32 signaled;
+ __u32 manual;
+ };
+
+ struct ntsync_wait_args {
+ __u64 timeout;
+ __u64 objs;
+ __u32 count;
+ __u32 owner;
+ __u32 index;
+ __u32 alert;
+ __u32 flags;
+ __u32 pad;
+ };
+
+Depending on the ioctl, members of the structure may be used as input,
+output, or not at all. All ioctls return 0 on success.
+
+The ioctls on the device file are as follows:
+
+.. c:macro:: NTSYNC_IOC_CREATE_SEM
+
+ Create a semaphore object. Takes a pointer to struct
+ :c:type:`ntsync_sem_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``sem``
+ - On output, contains a file descriptor to the created semaphore.
+ * - ``count``
+ - Initial count of the semaphore.
+ * - ``max``
+ - Maximum count of the semaphore.
+
+ Fails with ``EINVAL`` if ``count`` is greater than ``max``.
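
A hedged sketch of semaphore creation, assuming ``instance`` is an open ``/dev/ntsync`` file descriptor (helper name ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>

  /* Create a semaphore with an initial count of 0 and a maximum of 2;
   * returns the semaphore's file descriptor, or -1 on error. */
  static int create_sem(int instance)
  {
          struct ntsync_sem_args args = {
                  .count = 0,
                  .max   = 2,
          };

          if (ioctl(instance, NTSYNC_IOC_CREATE_SEM, &args) < 0)
                  return -1;
          return args.sem;
  }

The returned descriptor can later be passed in the ``objs`` array of the wait ioctls described below.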
+
+.. c:macro:: NTSYNC_IOC_CREATE_MUTEX
+
+ Create a mutex object. Takes a pointer to struct
+ :c:type:`ntsync_mutex_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``mutex``
+ - On output, contains a file descriptor to the created mutex.
+ * - ``count``
+ - Initial recursion count of the mutex.
+ * - ``owner``
+ - Initial owner of the mutex.
+
+ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is
+ zero and ``count`` is nonzero, the function fails with ``EINVAL``.
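
Mutex creation follows the same pattern; in this illustrative sketch ``owner`` is an arbitrary nonzero identifier chosen by the caller, and the constraint above is respected by deriving ``count`` from it::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Create a mutex that starts out owned by @owner (recursion count 1),
   * or unowned if @owner is zero; returns its fd, or -1 on error. */
  static int create_mutex(int instance, __u32 owner)
  {
          struct ntsync_mutex_args args = {
                  .owner = owner,
                  .count = owner ? 1 : 0,
          };

          if (ioctl(instance, NTSYNC_IOC_CREATE_MUTEX, &args) < 0)
                  return -1;
          return args.mutex;
  }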
+
+.. c:macro:: NTSYNC_IOC_CREATE_EVENT
+
+ Create an event object. Takes a pointer to struct
+ :c:type:`ntsync_event_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``event``
+ - On output, contains a file descriptor to the created event.
+ * - ``signaled``
+ - If nonzero, the event is initially signaled, otherwise
+ nonsignaled.
+ * - ``manual``
+ - If nonzero, the event is a manual-reset event, otherwise
+ auto-reset.
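
Likewise for events; the sketch below creates a nonsignaled auto-reset event (helper name ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>

  /* Create a nonsignaled auto-reset event; returns its fd, or -1 on error. */
  static int create_auto_event(int instance)
  {
          struct ntsync_event_args args = {
                  .signaled = 0,
                  .manual   = 0,
          };

          if (ioctl(instance, NTSYNC_IOC_CREATE_EVENT, &args) < 0)
                  return -1;
          return args.event;
  }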
+
+The ioctls on the individual objects are as follows:
+
+.. c:macro:: NTSYNC_IOC_SEM_POST
+
+ Post to a semaphore object. Takes a pointer to a 32-bit integer,
+ which on input holds the count to be added to the semaphore, and on
+ output contains its previous count.
+
+ If adding to the semaphore's current count would raise the latter
+ past the semaphore's maximum count, the ioctl fails with
+ ``EOVERFLOW`` and the semaphore is not affected. If raising the
+ semaphore's count causes it to become signaled, eligible threads
+ waiting on this semaphore will be woken and the semaphore's count
+ decremented appropriately.
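
For example, posting a single count might look like this (illustrative sketch only; the helper name is ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Add 1 to the semaphore's count.  Returns the previous count, or -1 on
   * error; errno is EOVERFLOW if the maximum count would be exceeded. */
  static int sem_post_one(int sem)
  {
          __u32 count = 1;

          if (ioctl(sem, NTSYNC_IOC_SEM_POST, &count) < 0)
                  return -1;
          return (int)count;      /* previous count, written back by the ioctl */
  }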
+
+.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK
+
+ Release a mutex object. Takes a pointer to struct
+ :c:type:`ntsync_mutex_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``mutex``
+ - Ignored.
+ * - ``owner``
+ - Specifies the owner trying to release this mutex.
+ * - ``count``
+ - On output, contains the previous recursion count.
+
+ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner``
+ is not the current owner of the mutex, the ioctl fails with
+ ``EPERM``.
+
+ The mutex's count will be decremented by one. If decrementing the
+ mutex's count causes it to become zero, the mutex is marked as
+ unowned and signaled, and eligible threads waiting on it will be
+ woken as appropriate.
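
An illustrative sketch of releasing one level of recursion (helper name ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Release one level of recursion held by @owner.  Returns the previous
   * recursion count, or -1 on error (EPERM if @owner is not the owner). */
  static int mutex_unlock_once(int mutex, __u32 owner)
  {
          struct ntsync_mutex_args args = { .owner = owner };

          if (ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args) < 0)
                  return -1;
          return (int)args.count;
  }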
+
+.. c:macro:: NTSYNC_IOC_SET_EVENT
+
+ Signal an event object. Takes a pointer to a 32-bit integer, which on
+ output contains the previous state of the event.
+
+ Eligible threads will be woken, and auto-reset events will be
+ designaled appropriately.
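
For example (sketch; helper name ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Signal the event.  Returns its previous state (0 or 1), or -1 on error. */
  static int event_set(int event)
  {
          __u32 prev;

          if (ioctl(event, NTSYNC_IOC_SET_EVENT, &prev) < 0)
                  return -1;
          return (int)prev;
  }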
+
+.. c:macro:: NTSYNC_IOC_RESET_EVENT
+
+ Designal an event object. Takes a pointer to a 32-bit integer, which
+ on output contains the previous state of the event.
+
+.. c:macro:: NTSYNC_IOC_PULSE_EVENT
+
+ Wake threads waiting on an event object while leaving it in an
+ unsignaled state. Takes a pointer to a 32-bit integer, which on
+ output contains the previous state of the event.
+
+ A pulse operation can be thought of as a set followed by a reset,
+ performed as a single atomic operation. If two threads are waiting on
+ an auto-reset event which is pulsed, only one will be woken. If two
+  threads are waiting on a manual-reset event which is pulsed, both will
+ be woken. However, in both cases, the event will be unsignaled
+ afterwards, and a simultaneous read operation will always report the
+ event as unsignaled.
+
+.. c:macro:: NTSYNC_IOC_READ_SEM
+
+ Read the current state of a semaphore object. Takes a pointer to
+ struct :c:type:`ntsync_sem_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``sem``
+ - Ignored.
+ * - ``count``
+ - On output, contains the current count of the semaphore.
+ * - ``max``
+ - On output, contains the maximum count of the semaphore.
+
+.. c:macro:: NTSYNC_IOC_READ_MUTEX
+
+ Read the current state of a mutex object. Takes a pointer to struct
+ :c:type:`ntsync_mutex_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``mutex``
+ - Ignored.
+ * - ``owner``
+ - On output, contains the current owner of the mutex, or zero
+ if the mutex is not currently owned.
+ * - ``count``
+ - On output, contains the current recursion count of the mutex.
+
+ If the mutex is marked as abandoned, the function fails with
+ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to
+ zero.
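
A sketch of reading a mutex while accounting for abandonment (helper name ours)::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Read the mutex state.  Returns 0 on success; on failure returns -1 with
   * errno set (EOWNERDEAD if the previous owner died holding the mutex). */
  static int read_mutex(int mutex, __u32 *owner, __u32 *count)
  {
          struct ntsync_mutex_args args;

          if (ioctl(mutex, NTSYNC_IOC_READ_MUTEX, &args) < 0)
                  return -1;
          *owner = args.owner;
          *count = args.count;
          return 0;
  }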
+
+.. c:macro:: NTSYNC_IOC_READ_EVENT
+
+ Read the current state of an event object. Takes a pointer to struct
+ :c:type:`ntsync_event_args`, which is used as follows:
+
+ .. list-table::
+
+ * - ``event``
+ - Ignored.
+ * - ``signaled``
+ - On output, contains the current state of the event.
+ * - ``manual``
+ - On output, contains 1 if the event is a manual-reset event,
+ and 0 otherwise.
+
+.. c:macro:: NTSYNC_IOC_MUTEX_KILL
+
+ Mark a mutex as unowned and abandoned if it is owned by the given
+ owner. Takes an input-only pointer to a 32-bit integer denoting the
+ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the
+ owner does not own the mutex, the function fails with ``EPERM``.
+
+ Eligible threads waiting on the mutex will be woken as appropriate
+ (and such waits will fail with ``EOWNERDEAD``, as described below).
+
+.. c:macro:: NTSYNC_IOC_WAIT_ANY
+
+ Poll on any of a list of objects, atomically acquiring at most one.
+ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is
+ used as follows:
+
+ .. list-table::
+
+ * - ``timeout``
+ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME``
+ is set, the timeout is measured against the REALTIME clock;
+ otherwise it is measured against the MONOTONIC clock. If the
+ timeout is equal to or earlier than the current time, the
+ function returns immediately without sleeping. If ``timeout``
+ is U64_MAX, the function will sleep until an object is
+ signaled, and will not fail with ``ETIMEDOUT``.
+ * - ``objs``
+ - Pointer to an array of ``count`` file descriptors
+ (specified as an integer so that the structure has the same
+ size regardless of architecture). If any object is
+ invalid, the function fails with ``EINVAL``.
+ * - ``count``
+ - Number of objects specified in the ``objs`` array.
+ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails
+ with ``EINVAL``.
+ * - ``owner``
+ - Mutex owner identifier. If any object in ``objs`` is a mutex,
+ the ioctl will attempt to acquire that mutex on behalf of
+ ``owner``. If ``owner`` is zero, the ioctl fails with
+ ``EINVAL``.
+ * - ``index``
+ - On success, contains the index (into ``objs``) of the object
+ which was signaled. If ``alert`` was signaled instead,
+ this contains ``count``.
+ * - ``alert``
+ - Optional event object file descriptor. If nonzero, this
+ specifies an "alert" event object which, if signaled, will
+ terminate the wait. If nonzero, the identifier must point to a
+ valid event.
+ * - ``flags``
+ - Zero or more flags. Currently the only flag is
+ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be
+ measured against the REALTIME clock instead of MONOTONIC.
+ * - ``pad``
+ - Unused, must be set to zero.
+
+ This function attempts to acquire one of the given objects. If unable
+ to do so, it sleeps until an object becomes signaled, subsequently
+ acquiring it, or the timeout expires. In the latter case the ioctl
+ fails with ``ETIMEDOUT``. The function only acquires one object, even
+ if multiple objects are signaled.
+
+ A semaphore is considered to be signaled if its count is nonzero, and
+ is acquired by decrementing its count by one. A mutex is considered
+ to be signaled if it is unowned or if its owner matches the ``owner``
+ argument, and is acquired by incrementing its recursion count by one
+ and setting its owner to the ``owner`` argument. An auto-reset event
+ is acquired by designaling it; a manual-reset event is not affected
+ by acquisition.
+
+ Acquisition is atomic and totally ordered with respect to other
+ operations on the same object. If two wait operations (with different
+ ``owner`` identifiers) are queued on the same mutex, only one is
+ signaled. If two wait operations are queued on the same semaphore,
+ and a value of one is posted to it, only one is signaled.
+
+ If an abandoned mutex is acquired, the ioctl fails with
+ ``EOWNERDEAD``. Although this is a failure return, the function may
+ otherwise be considered successful. The mutex is marked as owned by
+ the given owner (with a recursion count of 1) and as no longer
+ abandoned, and ``index`` is still set to the index of the mutex.
+
+ The ``alert`` argument is an "extra" event which can terminate the
+ wait, independently of all other objects.
+
+ It is valid to pass the same object more than once, including by
+ passing the same event in the ``objs`` array and in ``alert``. If a
+ wakeup occurs due to that object being signaled, ``index`` is set to
+ the lowest index corresponding to that object.
+
+ The function may fail with ``EINTR`` if a signal is received.
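
Putting this together, a hedged sketch of an "any" wait with a one-second timeout against the MONOTONIC clock follows; the helper and its calling convention are ours, and ``objs`` holds object file descriptors::

  #include <stdint.h>             /* uintptr_t */
  #include <time.h>               /* clock_gettime() */
  #include <sys/ioctl.h>
  #include <linux/ntsync.h>
  #include <linux/types.h>

  /* Wait up to one second for any of @count objects, acquiring at most one.
   * Returns the index of the acquired object, or -1 on error or timeout. */
  static int wait_any_1s(int instance, const int *objs, __u32 count, __u32 owner)
  {
          struct ntsync_wait_args args = { 0 };
          struct timespec ts;

          clock_gettime(CLOCK_MONOTONIC, &ts);
          args.timeout = (__u64)(ts.tv_sec + 1) * 1000000000ULL + ts.tv_nsec;
          args.objs    = (uintptr_t)objs;
          args.count   = count;
          args.owner   = owner;   /* nonzero mutex-owner identifier */
          args.alert   = 0;       /* no alert event */

          if (ioctl(instance, NTSYNC_IOC_WAIT_ANY, &args) < 0)
                  return -1;      /* e.g. ETIMEDOUT, EINTR, EOWNERDEAD */
          return (int)args.index;
  }

Note that an ``EOWNERDEAD`` failure still acquires the mutex, as described above.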
+
+.. c:macro:: NTSYNC_IOC_WAIT_ALL
+
+ Poll on a list of objects, atomically acquiring all of them. Takes a
+ pointer to struct :c:type:`ntsync_wait_args`, which is used
+ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is
+ always filled with zero on success if not woken via alert.
+
+ This function attempts to simultaneously acquire all of the given
+ objects. If unable to do so, it sleeps until all objects become
+ simultaneously signaled, subsequently acquiring them, or the timeout
+ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no
+ objects are modified.
+
+ Objects may become signaled and subsequently designaled (through
+ acquisition by other threads) while this thread is sleeping. Only
+ once all objects are simultaneously signaled does the ioctl acquire
+ them and return. The entire acquisition is atomic and totally ordered
+ with respect to other operations on any of the given objects.
+
+ If an abandoned mutex is acquired, the ioctl fails with
+ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are
+ nevertheless marked as acquired. Note that if multiple mutex objects
+ are specified, there is no way to know which were marked as
+ abandoned.
+
+ As with "any" waits, the ``alert`` argument is an "extra" event which
+ can terminate the wait. Critically, however, an "all" wait will
+ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is
+ signaled. In the latter case ``index`` will be set to ``count``. As
+  with "any" waits, if both conditions are satisfied, the former takes
+ priority, and objects in ``objs`` will be acquired.
+
+ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same
+ object more than once, nor is it valid to pass the same object in
+ ``objs`` and in ``alert``. If this is attempted, the function fails
+ with ``EINVAL``.
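
An "all" wait uses the same structure and differs only in the ioctl number and the meaning of ``index``; a minimal sketch under the same assumptions as above::

  #include <sys/ioctl.h>
  #include <linux/ntsync.h>

  /* Acquire all objects described by @args atomically; on failure nothing
   * is acquired.  Returns args->index (0, or args->count if the alert event
   * ended the wait), or -1 on error or timeout. */
  static int wait_all(int instance, struct ntsync_wait_args *args)
  {
          if (ioctl(instance, NTSYNC_IOC_WAIT_ALL, args) < 0)
                  return -1;
          return (int)args->index;
  }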


@ -0,0 +1,21 @@
From 06bc88f16094c6f38e0890992af4a32415716c5d Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Thu, 18 Jul 2024 21:19:39 +0200
Subject: Revert "misc: ntsync: mark driver as "broken" to prevent from
building"
This reverts commit f5b335dc025cfee90957efa90dc72fada0d5abb4.
---
drivers/misc/Kconfig | 1 -
1 file changed, 1 deletion(-)
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -507,7 +507,6 @@ config OPEN_DICE
config NTSYNC
tristate "NT synchronization primitive emulation"
- depends on BROKEN
help
This module provides kernel support for emulation of Windows NT
synchronization primitives. It is not a hardware driver.


@ -0,0 +1,32 @@
From 1fd57f0a5e998d99035ecb3b110cbdb588403c83 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Sat, 5 Dec 2015 15:07:03 +0100
Subject: [PATCH] mac80211: ignore AP power level when tx power type is "fixed"
In some cases a user might want to connect to a far away access point,
which announces a low tx power limit. Using the AP's power limit can
make the connection significantly more unstable or even impossible, and
mac80211 currently provides no way to disable this behavior.
To fix this, use the currently unused distinction between limited and
fixed tx power to decide whether a remote AP's power limit should be
accepted.
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
net/mac80211/iface.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -62,7 +62,8 @@ bool __ieee80211_recalc_txpower(struct i
if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL)
power = min(power, sdata->deflink.user_power_level);
- if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL)
+ if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL &&
+ sdata->vif.bss_conf.txpower_type != NL80211_TX_POWER_FIXED)
power = min(power, sdata->deflink.ap_power_level);
if (power != sdata->vif.bss_conf.txpower) {


@ -0,0 +1,24 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 23 Apr 2024 12:35:21 +0200
Subject: [PATCH] net: enable fraglist GRO by default
This can significantly improve performance for packet forwarding/bridging
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -242,10 +242,10 @@ static inline int find_next_netdev_featu
#define NETIF_F_UPPER_DISABLES NETIF_F_LRO
/* changeable features with no special hardware requirements */
-#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO)
+#define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO | NETIF_F_GRO_FRAGLIST)
/* Changeable features with no special hardware requirements that defaults to off. */
-#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)
+#define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_UDP_FWD)
#define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \
NETIF_F_HW_VLAN_CTAG_RX | \


@ -0,0 +1,129 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 15 Aug 2024 21:15:13 +0200
Subject: [PATCH] net: remove NETIF_F_GSO_FRAGLIST from NETIF_F_GSO_SOFTWARE
Several drivers set NETIF_F_GSO_SOFTWARE, but mangle fraglist GRO packets
in a way that they can't be properly segmented anymore.
In order to properly deal with this, remove fraglist GSO from
NETIF_F_GSO_SOFTWARE and switch to NETIF_F_GSO_SOFTWARE_ALL (which includes
fraglist GSO) in places where it's safe to add.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -110,7 +110,7 @@ static void dummy_setup(struct net_devic
dev->flags &= ~IFF_MULTICAST;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST;
- dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->features |= NETIF_F_GSO_SOFTWARE_ALL;
dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
dev->features |= NETIF_F_GSO_ENCAP_ALL;
dev->hw_features |= dev->features;
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -172,7 +172,7 @@ static void gen_lo_setup(struct net_devi
dev->flags = IFF_LOOPBACK;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
netif_keep_dst(dev);
- dev->hw_features = NETIF_F_GSO_SOFTWARE;
+ dev->hw_features = NETIF_F_GSO_SOFTWARE_ALL;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
| NETIF_F_GSO_SOFTWARE
| NETIF_F_HW_CSUM
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -897,7 +897,7 @@ static int macvlan_hwtstamp_set(struct n
static struct lock_class_key macvlan_netdev_addr_lock_key;
#define ALWAYS_ON_OFFLOADS \
- (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \
+ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE_ALL | \
NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL)
#define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX)
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -219,13 +219,14 @@ static inline int find_next_netdev_featu
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \
- NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST)
+ NETIF_F_GSO_UDP_L4)
+#define NETIF_F_GSO_SOFTWARE_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_FRAGLIST)
/*
* If one device supports one of these features, then enable them
* for all in netdev_increment_features.
*/
-#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
+#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ROBUST | \
NETIF_F_SG | NETIF_F_HIGHDMA | \
NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED)
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -108,7 +108,7 @@ static inline netdev_features_t vlan_tnl
netdev_features_t ret;
ret = real_dev->hw_enc_features &
- (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE |
+ (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE_ALL |
NETIF_F_GSO_ENCAP_ALL);
if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK))
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -561,7 +561,7 @@ static int vlan_dev_init(struct net_devi
dev->state |= (1 << __LINK_STATE_NOCARRIER);
dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
- NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE_ALL |
NETIF_F_GSO_ENCAP_ALL |
NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
NETIF_F_ALL_FCOE;
@@ -654,7 +654,7 @@ static netdev_features_t vlan_dev_fix_fe
if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
lower_features |= NETIF_F_HW_CSUM;
features = netdev_intersect_features(features, lower_features);
- features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE);
+ features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE_ALL);
features |= NETIF_F_LLTX;
return features;
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2451,7 +2451,7 @@ void sk_setup_caps(struct sock *sk, stru
if (sk_is_tcp(sk))
sk->sk_route_caps |= NETIF_F_GSO;
if (sk->sk_route_caps & NETIF_F_GSO)
- sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE_ALL;
if (unlikely(sk->sk_gso_disabled))
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2009,7 +2009,7 @@ void ieee80211_color_collision_detection
/* interface handling */
#define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
NETIF_F_HW_CSUM | NETIF_F_SG | \
- NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE_ALL | \
NETIF_F_HW_TC)
#define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM)
#define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -109,7 +109,7 @@ static void do_setup(struct net_device *
netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
- NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL;
+ NETIF_F_GSO_SOFTWARE_ALL | NETIF_F_GSO_ENCAP_ALL;
netdev->vlan_features = netdev->features;
netdev->hw_enc_features = netdev->features;


@ -0,0 +1,762 @@
From a657df31affbb91d8cb2718e70f42cf8ed6e9a7c Mon Sep 17 00:00:00 2001
From: graysky <therealgraysky AT proton DOT me>
Date: Mon, 16 Sep 2024 05:55:58 -0400
Subject: ZEN: Add graysky's more-ISA-levels-and-uarches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From https://github.com/graysky2/kernel_compiler_patch
more-ISA-levels-and-uarches-for-kernel-6.8-rc4+.patch
FEATURES
This patch adds additional tunings via new x86-64 ISA levels and
more micro-architecture options to the Linux kernel in three classes.
1. New generic x86-64 ISA levels
These are selectable under:
Processor type and features ---> x86-64 compiler ISA level
• x86-64 A value of (1) is the default
• x86-64-v2 A value of (2) brings support for vector
instructions up to Streaming SIMD Extensions 4.2 (SSE4.2)
and Supplemental Streaming SIMD Extensions 3 (SSSE3), the
POPCNT instruction, and CMPXCHG16B.
• x86-64-v3 A value of (3) adds vector instructions up to AVX2, MOVBE,
and additional bit-manipulation instructions.
There is also x86-64-v4 but including this makes little sense as
the kernel does not use any of the AVX512 instructions anyway.
Users of glibc 2.33 and above can see which level is supported by running:
/lib/ld-linux-x86-64.so.2 --help | grep supported
Or
/lib64/ld-linux-x86-64.so.2 --help | grep supported
2. New micro-architectures
These are selectable under:
Processor type and features ---> Processor family
• AMD Improved K8-family
• AMD K10-family
• AMD Family 10h (Barcelona)
• AMD Family 14h (Bobcat)
• AMD Family 16h (Jaguar)
• AMD Family 15h (Bulldozer)
• AMD Family 15h (Piledriver)
• AMD Family 15h (Steamroller)
• AMD Family 15h (Excavator)
• AMD Family 17h (Zen)
• AMD Family 17h (Zen 2)
• AMD Family 19h (Zen 3)**
• AMD Family 19h (Zen 4)‡
• AMD Family 1Ah (Zen 5)§
• Intel Silvermont low-power processors
• Intel Goldmont low-power processors (Apollo Lake and Denverton)
• Intel Goldmont Plus low-power processors (Gemini Lake)
• Intel 1st Gen Core i3/i5/i7 (Nehalem)
• Intel 1.5 Gen Core i3/i5/i7 (Westmere)
• Intel 2nd Gen Core i3/i5/i7 (Sandybridge)
• Intel 3rd Gen Core i3/i5/i7 (Ivybridge)
• Intel 4th Gen Core i3/i5/i7 (Haswell)
• Intel 5th Gen Core i3/i5/i7 (Broadwell)
• Intel 6th Gen Core i3/i5/i7 (Skylake)
• Intel 6th Gen Core i7/i9 (Skylake X)
• Intel 8th Gen Core i3/i5/i7 (Cannon Lake)
• Intel 10th Gen Core i7/i9 (Ice Lake)
• Intel Xeon (Cascade Lake)
• Intel Xeon (Cooper Lake)*
• Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)*
• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)†
• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)†
• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)†
• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)‡
• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)‡
• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)‡
Notes: If not otherwise noted, gcc >=9.1 is required for support.
*Requires gcc >=10.1 or clang >=10.0
**Requires gcc >=10.3 or clang >=12.0
†Requires gcc >=11.1 or clang >=12.0
‡Requires gcc >=13.0 or clang >=15.0.5
§Requires gcc >14.0 or clang >=19.0?
3. Auto-detected micro-architecture levels
Compile by passing the '-march=native' option which, "selects the CPU
to generate code for at compilation time by determining the processor type of
the compiling machine. Using -march=native enables all instruction subsets
supported by the local machine and will produce code optimized for the local
machine under the constraints of the selected instruction set."[1]
Users of Intel CPUs should select the 'Intel-Native' option and users of AMD
CPUs should select the 'AMD-Native' option.
MINOR NOTES RELATING TO INTEL ATOM PROCESSORS
This patch also changes -march=atom to -march=bonnell in accordance with the
gcc v4.9 changes. Upstream is using the deprecated -march=atom flag when I
believe it should use the newer -march=bonnell flag for atom processors.[2]
It is not recommended to compile on Atom-CPUs with the 'native' option.[3] The
recommendation is to use the 'atom' option instead.
BENEFITS
Small but real speed increases are measurable using a make endpoint comparing
a generic kernel to one built with one of the respective microarchs.
See the following experimental evidence supporting this statement:
https://github.com/graysky2/kernel_compiler_patch?tab=readme-ov-file#benchmarks
REQUIREMENTS
linux version 6.8-rc3+
gcc version >=9.0 or clang version >=9.0
ACKNOWLEDGMENTS
This patch builds on the seminal work by Jeroen.[4]
REFERENCES
1. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options
2. https://bugzilla.kernel.org/show_bug.cgi?id=77461
3. https://github.com/graysky2/kernel_gcc_patch/issues/15
4. http://www.linuxforge.net/docs/linux/linux-gcc.php
---
arch/x86/Kconfig.cpu | 363 ++++++++++++++++++++++++++++++--
arch/x86/Makefile | 89 +++++++-
arch/x86/include/asm/vermagic.h | 70 ++++++
3 files changed, 506 insertions(+), 16 deletions(-)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -155,9 +155,8 @@ config MPENTIUM4
-Paxville
-Dempsey
-
config MK6
- bool "K6/K6-II/K6-III"
+ bool "AMD K6/K6-II/K6-III"
depends on X86_32
help
Select this for an AMD K6-family processor. Enables use of
@@ -165,7 +164,7 @@ config MK6
flags to GCC.
config MK7
- bool "Athlon/Duron/K7"
+ bool "AMD Athlon/Duron/K7"
depends on X86_32
help
Select this for an AMD Athlon K7-family processor. Enables use of
@@ -173,12 +172,114 @@ config MK7
flags to GCC.
config MK8
- bool "Opteron/Athlon64/Hammer/K8"
+ bool "AMD Opteron/Athlon64/Hammer/K8"
help
Select this for an AMD Opteron or Athlon64 Hammer-family processor.
Enables use of some extended instructions, and passes appropriate
optimization flags to GCC.
+config MK8SSE3
+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+ help
+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MK10
+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+ help
+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MBARCELONA
+ bool "AMD Barcelona"
+ help
+ Select this for AMD Family 10h Barcelona processors.
+
+ Enables -march=barcelona
+
+config MBOBCAT
+ bool "AMD Bobcat"
+ help
+ Select this for AMD Family 14h Bobcat processors.
+
+ Enables -march=btver1
+
+config MJAGUAR
+ bool "AMD Jaguar"
+ help
+ Select this for AMD Family 16h Jaguar processors.
+
+ Enables -march=btver2
+
+config MBULLDOZER
+ bool "AMD Bulldozer"
+ help
+ Select this for AMD Family 15h Bulldozer processors.
+
+ Enables -march=bdver1
+
+config MPILEDRIVER
+ bool "AMD Piledriver"
+ help
+ Select this for AMD Family 15h Piledriver processors.
+
+ Enables -march=bdver2
+
+config MSTEAMROLLER
+ bool "AMD Steamroller"
+ help
+ Select this for AMD Family 15h Steamroller processors.
+
+ Enables -march=bdver3
+
+config MEXCAVATOR
+ bool "AMD Excavator"
+ help
+ Select this for AMD Family 15h Excavator processors.
+
+ Enables -march=bdver4
+
+config MZEN
+ bool "AMD Zen"
+ help
+ Select this for AMD Family 17h Zen processors.
+
+ Enables -march=znver1
+
+config MZEN2
+ bool "AMD Zen 2"
+ help
+ Select this for AMD Family 17h Zen 2 processors.
+
+ Enables -march=znver2
+
+config MZEN3
+ bool "AMD Zen 3"
+ depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ help
+ Select this for AMD Family 19h Zen 3 processors.
+
+ Enables -march=znver3
+
+config MZEN4
+ bool "AMD Zen 4"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000)
+ help
+ Select this for AMD Family 19h Zen 4 processors.
+
+ Enables -march=znver4
+
+config MZEN5
+ bool "AMD Zen 5"
+ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000)
+ help
+ Select this for AMD Family 19h Zen 5 processors.
+
+ Enables -march=znver5
+
config MCRUSOE
bool "Crusoe"
depends on X86_32
@@ -269,8 +370,17 @@ config MPSC
using the cpu family field
in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
+config MATOM
+ bool "Intel Atom"
+ help
+
+ Select this for the Intel Atom platform. Intel Atom CPUs have an
+ in-order pipelining architecture and thus can benefit from
+ accordingly optimized code. Use a recent GCC with specific Atom
+ support in order to fully benefit from selecting this option.
+
config MCORE2
- bool "Core 2/newer Xeon"
+ bool "Intel Core 2"
help
Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,14 +388,191 @@ config MCORE2
family in /proc/cpuinfo. Newer ones have 6 and older ones 15
(not a typo)
-config MATOM
- bool "Intel Atom"
+ Enables -march=core2
+
+config MNEHALEM
+ bool "Intel Nehalem"
help
- Select this for the Intel Atom platform. Intel Atom CPUs have an
- in-order pipelining architecture and thus can benefit from
- accordingly optimized code. Use a recent GCC with specific Atom
- support in order to fully benefit from selecting this option.
+ Select this for 1st Gen Core processors in the Nehalem family.
+
+ Enables -march=nehalem
+
+config MWESTMERE
+ bool "Intel Westmere"
+ help
+
+ Select this for the Intel Westmere formerly Nehalem-C family.
+
+ Enables -march=westmere
+
+config MSILVERMONT
+ bool "Intel Silvermont"
+ help
+
+ Select this for the Intel Silvermont platform.
+
+ Enables -march=silvermont
+
+config MGOLDMONT
+ bool "Intel Goldmont"
+ help
+
+ Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+ Enables -march=goldmont
+
+config MGOLDMONTPLUS
+ bool "Intel Goldmont Plus"
+ help
+
+ Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+ Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+ bool "Intel Sandy Bridge"
+ help
+
+ Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+ Enables -march=sandybridge
+
+config MIVYBRIDGE
+ bool "Intel Ivy Bridge"
+ help
+
+ Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+ Enables -march=ivybridge
+
+config MHASWELL
+ bool "Intel Haswell"
+ help
+
+ Select this for 4th Gen Core processors in the Haswell family.
+
+ Enables -march=haswell
+
+config MBROADWELL
+ bool "Intel Broadwell"
+ help
+
+ Select this for 5th Gen Core processors in the Broadwell family.
+
+ Enables -march=broadwell
+
+config MSKYLAKE
+ bool "Intel Skylake"
+ help
+
+ Select this for 6th Gen Core processors in the Skylake family.
+
+ Enables -march=skylake
+
+config MSKYLAKEX
+ bool "Intel Skylake X"
+ help
+
+ Select this for 6th Gen Core processors in the Skylake X family.
+
+ Enables -march=skylake-avx512
+
+config MCANNONLAKE
+ bool "Intel Cannon Lake"
+ help
+
+ Select this for 8th Gen Core processors
+
+ Enables -march=cannonlake
+
+config MICELAKE
+ bool "Intel Ice Lake"
+ help
+
+ Select this for 10th Gen Core processors in the Ice Lake family.
+
+ Enables -march=icelake-client
+
+config MCASCADELAKE
+ bool "Intel Cascade Lake"
+ help
+
+ Select this for Xeon processors in the Cascade Lake family.
+
+ Enables -march=cascadelake
+
+config MCOOPERLAKE
+ bool "Intel Cooper Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+ help
+
+ Select this for Xeon processors in the Cooper Lake family.
+
+ Enables -march=cooperlake
+
+config MTIGERLAKE
+ bool "Intel Tiger Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+ help
+
+ Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+ Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+ bool "Intel Sapphire Rapids"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ help
+
+ Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family.
+
+ Enables -march=sapphirerapids
+
+config MROCKETLAKE
+ bool "Intel Rocket Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ help
+
+ Select this for eleventh-generation processors in the Rocket Lake family.
+
+ Enables -march=rocketlake
+
+config MALDERLAKE
+ bool "Intel Alder Lake"
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ help
+
+ Select this for twelfth-generation processors in the Alder Lake family.
+
+ Enables -march=alderlake
+
+config MRAPTORLAKE
+ bool "Intel Raptor Lake"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ help
+
+ Select this for thirteenth-generation processors in the Raptor Lake family.
+
+ Enables -march=raptorlake
+
+config MMETEORLAKE
+ bool "Intel Meteor Lake"
+ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ help
+
+ Select this for fourteenth-generation processors in the Meteor Lake family.
+
+ Enables -march=meteorlake
+
+config MEMERALDRAPIDS
+ bool "Intel Emerald Rapids"
+ depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+ help
+
+ Select this for fifth-generation 10 nm process processors in the Emerald Rapids family.
+
+ Enables -march=emeraldrapids
config GENERIC_CPU
bool "Generic-x86-64"
@@ -294,8 +581,32 @@ config GENERIC_CPU
Generic x86-64 CPU.
Run equally well on all x86-64 CPUs.
+config MNATIVE_INTEL
+ bool "Intel-Native optimizations autodetected by the compiler"
+ help
+
+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+ the optimum settings to use based on your processor. Do NOT use this
+ for AMD CPUs. Intel Only!
+
+ Enables -march=native
+
+config MNATIVE_AMD
+ bool "AMD-Native optimizations autodetected by the compiler"
+ help
+
+ Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+ the optimum settings to use based on your processor. Do NOT use this
+ for Intel CPUs. AMD Only!
+
+ Enables -march=native
+
endchoice
+config SUPPORT_MARCH_CODEVERS
+ bool
+ default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+
config X86_GENERIC
bool "Generic x86 support"
depends on X86_32
@@ -308,6 +619,30 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
+config X86_64_VERSION
+ int "x86-64 compiler ISA level"
+ range 1 3
+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+ depends on X86_64 && GENERIC_CPU
+ help
+ Specify a specific x86-64 compiler ISA level.
+
+ There are three x86-64 ISA levels that work on top of
+ the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4.
+
+ x86-64-v2 brings support for vector instructions up to Streaming SIMD
+ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3
+ (SSSE3), the POPCNT instruction, and CMPXCHG16B.
+
+ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional
+ bit-manipulation instructions.
+
+ x86-64-v4 is not included since the kernel does not use AVX512 instructions
+
+ You can find the best version for your CPU by running one of the following:
+ /lib/ld-linux-x86-64.so.2 --help | grep supported
+ /lib64/ld-linux-x86-64.so.2 --help | grep supported
+
#
# Define implied options from the CPU selection here
config X86_INTERNODE_CACHE_SHIFT
@@ -318,7 +653,7 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
default "4" if MELAN || M486SX || M486 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
@@ -336,11 +671,11 @@ config X86_ALIGNMENT_16
config X86_INTEL_USERCOPY
def_bool y
- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL
config X86_USE_PPRO_CHECKSUM
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
#
# P6_NOPs are a relatively minor optimization that require a family >=
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -178,14 +178,99 @@ else
cflags-$(CONFIG_MPSC) += -march=nocona
cflags-$(CONFIG_MCORE2) += -march=core2
cflags-$(CONFIG_MATOM) += -march=atom
- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
+ ifeq ($(CONFIG_X86_64_VERSION),1)
+ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
+ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic
+ else
+ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION)
+ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION)
+ endif
+ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3
+ cflags-$(CONFIG_MK10) += -march=amdfam10
+ cflags-$(CONFIG_MBARCELONA) += -march=barcelona
+ cflags-$(CONFIG_MBOBCAT) += -march=btver1
+ cflags-$(CONFIG_MJAGUAR) += -march=btver2
+ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1
+ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm
+ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm
+ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm
+ cflags-$(CONFIG_MZEN) += -march=znver1
+ cflags-$(CONFIG_MZEN2) += -march=znver2
+ cflags-$(CONFIG_MZEN3) += -march=znver3
+ cflags-$(CONFIG_MZEN4) += -march=znver4
+ cflags-$(CONFIG_MZEN5) += -march=znver5
+ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -mno-tbm
+ cflags-$(CONFIG_MATOM) += -march=bonnell
+ cflags-$(CONFIG_MCORE2) += -march=core2
+ cflags-$(CONFIG_MNEHALEM) += -march=nehalem
+ cflags-$(CONFIG_MWESTMERE) += -march=westmere
+ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont
+ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont
+ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge
+ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge
+ cflags-$(CONFIG_MHASWELL) += -march=haswell
+ cflags-$(CONFIG_MBROADWELL) += -march=broadwell
+ cflags-$(CONFIG_MSKYLAKE) += -march=skylake
+ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512
+ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake
+ cflags-$(CONFIG_MICELAKE) += -march=icelake-client
+ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake
+ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake
+ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake
+ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake
+ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake
+ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake
+ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake
+ cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids
KBUILD_CFLAGS += $(cflags-y)
rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8
rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona
rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2
rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom
- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic
+ rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3
+ rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10
+ rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona
+ rustflags-$(CONFIG_MBOBCAT) += -Ctarget-cpu=btver1
+ rustflags-$(CONFIG_MJAGUAR) += -Ctarget-cpu=btver2
+ rustflags-$(CONFIG_MBULLDOZER) += -Ctarget-cpu=bdver1
+ rustflags-$(CONFIG_MPILEDRIVER) += -Ctarget-cpu=bdver2
+ rustflags-$(CONFIG_MSTEAMROLLER) += -Ctarget-cpu=bdver3
+ rustflags-$(CONFIG_MEXCAVATOR) += -Ctarget-cpu=bdver4
+ rustflags-$(CONFIG_MZEN) += -Ctarget-cpu=znver1
+ rustflags-$(CONFIG_MZEN2) += -Ctarget-cpu=znver2
+ rustflags-$(CONFIG_MZEN3) += -Ctarget-cpu=znver3
+ rustflags-$(CONFIG_MZEN4) += -Ctarget-cpu=znver4
+ rustflags-$(CONFIG_MZEN5) += -Ctarget-cpu=znver5
+ rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native
+ rustflags-$(CONFIG_MNATIVE_AMD) += -Ctarget-cpu=native
+ rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=bonnell
+ rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2
+ rustflags-$(CONFIG_MNEHALEM) += -Ctarget-cpu=nehalem
+ rustflags-$(CONFIG_MWESTMERE) += -Ctarget-cpu=westmere
+ rustflags-$(CONFIG_MSILVERMONT) += -Ctarget-cpu=silvermont
+ rustflags-$(CONFIG_MGOLDMONT) += -Ctarget-cpu=goldmont
+ rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus
+ rustflags-$(CONFIG_MSANDYBRIDGE) += -Ctarget-cpu=sandybridge
+ rustflags-$(CONFIG_MIVYBRIDGE) += -Ctarget-cpu=ivybridge
+ rustflags-$(CONFIG_MHASWELL) += -Ctarget-cpu=haswell
+ rustflags-$(CONFIG_MBROADWELL) += -Ctarget-cpu=broadwell
+ rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake
+ rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512
+ rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake
+ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client
+ rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake
+ rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake
+ rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake
+ rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids
+ rustflags-$(CONFIG_MROCKETLAKE) += -Ctarget-cpu=rocketlake
+ rustflags-$(CONFIG_MALDERLAKE) += -Ctarget-cpu=alderlake
+ rustflags-$(CONFIG_MRAPTORLAKE) += -Ctarget-cpu=raptorlake
+ rustflags-$(CONFIG_MMETEORLAKE) += -Ctarget-cpu=meteorlake
+ rustflags-$(CONFIG_MEMERALDRAPIDS) += -Ctarget-cpu=emeraldrapids
KBUILD_RUSTFLAGS += $(rustflags-y)
KBUILD_CFLAGS += -mno-red-zone
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,54 @@
#define MODULE_PROC_FAMILY "586MMX "
#elif defined CONFIG_MCORE2
#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
+#define MODULE_PROC_FAMILY "SKYLAKEX "
+#elif defined CONFIG_MCANNONLAKE
+#define MODULE_PROC_FAMILY "CANNONLAKE "
+#elif defined CONFIG_MICELAKE
+#define MODULE_PROC_FAMILY "ICELAKE "
+#elif defined CONFIG_MCASCADELAKE
+#define MODULE_PROC_FAMILY "CASCADELAKE "
+#elif defined CONFIG_MCOOPERLAKE
+#define MODULE_PROC_FAMILY "COOPERLAKE "
+#elif defined CONFIG_MTIGERLAKE
+#define MODULE_PROC_FAMILY "TIGERLAKE "
+#elif defined CONFIG_MSAPPHIRERAPIDS
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
+#elif defined CONFIG_ROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
+#elif defined CONFIG_MEMERALDRAPIDS
+#define MODULE_PROC_FAMILY "EMERALDRAPIDS "
#elif defined CONFIG_MATOM
#define MODULE_PROC_FAMILY "ATOM "
#elif defined CONFIG_M686
@@ -35,6 +83,28 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE


@ -0,0 +1,60 @@
From 44295ad130b8735cecb288dd7463a14892803d9b Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Tue, 1 Oct 2024 02:05:12 +0200
Subject: ZEN: Fixup graysky's more-ISA-levels-and-uarches
See: https://github.com/graysky2/kernel_compiler_patch/issues/105
---
arch/x86/Kconfig.cpu | 4 ----
arch/x86/Makefile | 4 ----
arch/x86/include/asm/vermagic.h | 6 ++++++
3 files changed, 6 insertions(+), 8 deletions(-)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -603,10 +603,6 @@ config MNATIVE_AMD
endchoice
-config SUPPORT_MARCH_CODEVERS
- bool
- default y if (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
-
config X86_GENERIC
bool "Generic x86 support"
depends on X86_32
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -176,8 +176,6 @@ else
# FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
cflags-$(CONFIG_MK8) += -march=k8
cflags-$(CONFIG_MPSC) += -march=nocona
- cflags-$(CONFIG_MCORE2) += -march=core2
- cflags-$(CONFIG_MATOM) += -march=atom
ifeq ($(CONFIG_X86_64_VERSION),1)
cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic
@@ -229,8 +227,6 @@ else
rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8
rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona
- rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2
- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom
rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3
rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10
rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -105,6 +105,12 @@
#define MODULE_PROC_FAMILY "ZEN "
#elif defined CONFIG_MZEN2
#define MODULE_PROC_FAMILY "ZEN2 "
+#elif defined CONFIG_MZEN3
+#define MODULE_PROC_FAMILY "ZEN3 "
+#elif defined CONFIG_MZEN4
+#define MODULE_PROC_FAMILY "ZEN4 "
+#elif defined CONFIG_MZEN5
+#define MODULE_PROC_FAMILY "ZEN5 "
#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE


@ -0,0 +1,40 @@
From 8dc948926f5b68b16a6a47a8f6e0a2154ac8ef3e Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org>
Date: Sun, 11 Dec 2022 23:51:16 +0100
Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3
This reverts a6036a41bffba3d5007e377483b425d470ad8042 (kbuild: drop
support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3), removes the
dependency on CONFIG_ARC and adds RUSTFLAGS.
---
Makefile | 3 +++
init/Kconfig | 6 ++++++
2 files changed, 9 insertions(+)
--- a/Makefile
+++ b/Makefile
@@ -814,6 +814,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointe
ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
KBUILD_CFLAGS += -O2
KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += -Os
KBUILD_RUSTFLAGS += -Copt-level=s
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1407,6 +1407,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
with the "-O2" compiler flag for best performance and most
helpful compile-time warnings.
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+ bool "Optimize more for performance (-O3)"
+ help
+ Choosing this option will pass "-O3" to your compiler to optimize
+ the kernel yet more for performance.
+
config CC_OPTIMIZE_FOR_SIZE
bool "Optimize for size (-Os)"
help


@ -0,0 +1,11 @@
--- a/Makefile
+++ b/Makefile
@@ -815,7 +815,7 @@ ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
KBUILD_CFLAGS += -O2
KBUILD_RUSTFLAGS += -Copt-level=2
else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
-KBUILD_CFLAGS += -O3
+KBUILD_CFLAGS += -O3 $(call cc-option,-fivopts)
KBUILD_RUSTFLAGS += -Copt-level=3
else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
KBUILD_CFLAGS += -Os


@ -0,0 +1,25 @@
From 3ebc1fdf3e0ee9bff1efe20eb5791eba5c84a810 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Thu, 3 Aug 2023 13:53:49 +0000
Subject: [PATCH 01/19] XANMOD: x86/build: Prevent generating avx2 and avx512
floating-point code
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
arch/x86/Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,9 +70,9 @@ export BITS
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f
KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
-KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f
#
# CFLAGS for compiling floating point code inside the kernel.


@ -0,0 +1,11 @@
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,7 +70,7 @@ export BITS
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f -fno-tree-vectorize
KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f


@ -0,0 +1,70 @@
From 3427331872c37b2edb42406c65764e1565b0591b Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Fri, 9 Aug 2024 14:09:05 +0800
Subject: cpufreq: amd-pstate: add quirk for Ryzen 3000 series processor
The Ryzen 3000 series processors have been observed lacking the
nominal_freq and lowest_freq parameters in their ACPI tables. This
absence causes issues with loading the amd-pstate driver on these
systems. Introduces a fix to resolve the dependency issue
by adding a quirk specifically for the Ryzen 3000 series.
Reported-by: David Wang <00107082@163.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
---
drivers/cpufreq/amd-pstate.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -142,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62
.lowest_freq = 550,
};
+static struct quirk_entry quirk_amd_mts = {
+ .nominal_freq = 3600,
+ .lowest_freq = 550,
+};
+
static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
{
/**
@@ -158,6 +163,21 @@ static int __init dmi_matched_7k62_bios_
return 0;
}
+static int __init dmi_matched_mts_bios_bug(const struct dmi_system_id *dmi)
+{
+ /**
+ * match the broken bios for ryzen 3000 series processor support CPPC V2
+ * broken BIOS lack of nominal_freq and lowest_freq capabilities
+ * definition in ACPI tables
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2)) {
+ quirks = dmi->driver_data;
+ pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident);
+ return 1;
+ }
+
+ return 0;
+}
static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
{
.callback = dmi_matched_7k62_bios_bug,
@@ -168,6 +188,16 @@ static const struct dmi_system_id amd_ps
},
.driver_data = &quirk_amd_7k62,
},
+ {
+ .callback = dmi_matched_mts_bios_bug,
+ .ident = "AMD Ryzen 3000",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "B450M MORTAR MAX (MS-7B89)"),
+ DMI_MATCH(DMI_BIOS_RELEASE, "06/10/2020"),
+ DMI_MATCH(DMI_BIOS_VERSION, "5.14"),
+ },
+ .driver_data = &quirk_amd_mts,
+ },
{}
};
MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table);


@ -0,0 +1,88 @@
From 44f21855901b1fd618ac16b07dbd14e8fea4ee13 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:11 -0500
Subject: cpufreq/amd-pstate: Export symbols for changing modes
In order to effectively test all mode switch combinations export
everything necessary for amd-pstate-ut to trigger a mode switch.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 23 ++++++++++-------------
drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++
2 files changed, 24 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -60,18 +60,6 @@
#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF
#define AMD_CPPC_EPP_POWERSAVE 0xFF
-/*
- * enum amd_pstate_mode - driver working mode of amd pstate
- */
-enum amd_pstate_mode {
- AMD_PSTATE_UNDEFINED = 0,
- AMD_PSTATE_DISABLE,
- AMD_PSTATE_PASSIVE,
- AMD_PSTATE_ACTIVE,
- AMD_PSTATE_GUIDED,
- AMD_PSTATE_MAX,
-};
-
static const char * const amd_pstate_mode_string[] = {
[AMD_PSTATE_UNDEFINED] = "undefined",
[AMD_PSTATE_DISABLE] = "disable",
@@ -81,6 +69,14 @@ static const char * const amd_pstate_mod
NULL,
};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode)
+{
+ if (mode < 0 || mode >= AMD_PSTATE_MAX)
+ return NULL;
+ return amd_pstate_mode_string[mode];
+}
+EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string);
+
struct quirk_entry {
u32 nominal_freq;
u32 lowest_freq;
@@ -1392,7 +1388,7 @@ static ssize_t amd_pstate_show_status(ch
return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
}
-static int amd_pstate_update_status(const char *buf, size_t size)
+int amd_pstate_update_status(const char *buf, size_t size)
{
int mode_idx;
@@ -1409,6 +1405,7 @@ static int amd_pstate_update_status(cons
return 0;
}
+EXPORT_SYMBOL_GPL(amd_pstate_update_status);
static ssize_t status_show(struct device *dev,
struct device_attribute *attr, char *buf)
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -103,4 +103,18 @@ struct amd_cpudata {
bool boost_state;
};
+/*
+ * enum amd_pstate_mode - driver working mode of amd pstate
+ */
+enum amd_pstate_mode {
+ AMD_PSTATE_UNDEFINED = 0,
+ AMD_PSTATE_DISABLE,
+ AMD_PSTATE_PASSIVE,
+ AMD_PSTATE_ACTIVE,
+ AMD_PSTATE_GUIDED,
+ AMD_PSTATE_MAX,
+};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode);
+int amd_pstate_update_status(const char *buf, size_t size);
+
#endif /* _LINUX_AMD_PSTATE_H */


@ -0,0 +1,77 @@
From aabfc7370a7da9c52be97c79ba70a20201e6864a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:12 -0500
Subject: cpufreq/amd-pstate-ut: Add test case for mode switches
There is a state machine in the amd-pstate driver utilized for
switches for all modes. To make sure that cleanup and setup works
properly for each mode add a unit test case that tries all
combinations.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++-
1 file changed, 40 insertions(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid
static void amd_pstate_ut_check_enabled(u32 index);
static void amd_pstate_ut_check_perf(u32 index);
static void amd_pstate_ut_check_freq(u32 index);
+static void amd_pstate_ut_check_driver(u32 index);
static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
{"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid },
{"amd_pstate_ut_check_enabled", amd_pstate_ut_check_enabled },
{"amd_pstate_ut_check_perf", amd_pstate_ut_check_perf },
- {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq }
+ {"amd_pstate_ut_check_freq", amd_pstate_ut_check_freq },
+ {"amd_pstate_ut_check_driver", amd_pstate_ut_check_driver }
};
static bool get_shared_mem(void)
@@ -257,6 +259,43 @@ skip_test:
cpufreq_cpu_put(policy);
}
+static int amd_pstate_set_mode(enum amd_pstate_mode mode)
+{
+ const char *mode_str = amd_pstate_get_mode_string(mode);
+
+ pr_debug("->setting mode to %s\n", mode_str);
+
+ return amd_pstate_update_status(mode_str, strlen(mode_str));
+}
+
+static void amd_pstate_ut_check_driver(u32 index)
+{
+ enum amd_pstate_mode mode1, mode2;
+ int ret;
+
+ for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
+ ret = amd_pstate_set_mode(mode1);
+ if (ret)
+ goto out;
+ for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
+ if (mode1 == mode2)
+ continue;
+ ret = amd_pstate_set_mode(mode2);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
+ amd_pstate_get_mode_string(mode1),
+ amd_pstate_get_mode_string(mode2), ret);
+
+ amd_pstate_ut_cases[index].result = ret ?
+ AMD_PSTATE_UT_RESULT_FAIL :
+ AMD_PSTATE_UT_RESULT_PASS;
+}
+
static int __init amd_pstate_ut_init(void)
{
u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);

View File

@ -0,0 +1,60 @@
From 24e62fbc101d079d398ac6fc76f458676d3d9491 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sun, 1 Sep 2024 00:00:35 -0500
Subject: cpufreq/amd-pstate: Catch failures for amd_pstate_epp_update_limit()
amd_pstate_set_epp() calls cppc_set_epp_perf() which can fail for
a variety of reasons but this is ignored. Change the return flow
to allow failures.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1595,7 +1595,7 @@ static void amd_pstate_epp_cpu_exit(stru
pr_debug("CPU %d exiting\n", policy->cpu);
}
-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
+static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
@@ -1645,7 +1645,7 @@ static void amd_pstate_epp_update_limit(
* This return value can only be negative for shared_memory
* systems where EPP register read/write not supported.
*/
- return;
+ return epp;
}
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
@@ -1658,12 +1658,13 @@ static void amd_pstate_epp_update_limit(
}
WRITE_ONCE(cpudata->cppc_req_cached, value);
- amd_pstate_set_epp(cpudata, epp);
+ return amd_pstate_set_epp(cpudata, epp);
}
static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
+ int ret;
if (!policy->cpuinfo.max_freq)
return -ENODEV;
@@ -1673,7 +1674,9 @@ static int amd_pstate_epp_set_policy(str
cpudata->policy = policy->policy;
- amd_pstate_epp_update_limit(policy);
+ ret = amd_pstate_epp_update_limit(policy);
+ if (ret)
+ return ret;
/*
* policy->cur is never updated with the amd_pstate_epp driver, but it

View File

@ -0,0 +1,67 @@
From 29c0347dd542e091e2f7e5980dd885f918f5f676 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:57 -0500
Subject: x86/amd: Move amd_get_highest_perf() from amd.c to cppc.c
To prepare to let amd_get_highest_perf() detect preferred cores
it will require CPPC functions. Move amd_get_highest_perf() to
cppc.c to prepare for 'preferred core detection' rework.
No functional changes intended.
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++
arch/x86/kernel/cpu/amd.c | 16 ----------------
2 files changed, 16 insertions(+), 16 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+u32 amd_get_highest_perf(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+ (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+ return 166;
+
+ if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+ (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+ return 166;
+
+ return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsig
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

View File

@ -0,0 +1,95 @@
From 072efeb45349edd8ba9def11b6a450eaf56690a8 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:58 -0500
Subject: ACPI: CPPC: Adjust return code for inline functions in
!CONFIG_ACPI_CPPC_LIB
Checkpath emits the following warning:
```
WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP
```
Adjust the code accordingly.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
include/acpi/cppc_acpi.h | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -164,31 +164,31 @@ extern int cppc_set_auto_sel(int cpu, bo
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_enable(int cpu, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline bool cppc_perf_ctrs_in_pcc(void)
{
@@ -212,27 +212,27 @@ static inline bool cpc_ffh_supported(voi
}
static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_set_auto_sel(int cpu, bool enable)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
{
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
#endif /* !CONFIG_ACPI_CPPC_LIB */

View File

@ -0,0 +1,162 @@
From 21492d91ffc7c3fdb6507f64a74abf8326c75141 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:29:59 -0500
Subject: x86/amd: Rename amd_get_highest_perf() to
amd_get_boost_ratio_numerator()
The function name is ambiguous because it returns an intermediate value
for calculating maximum frequency rather than the CPPC 'Highest Perf'
register.
Rename the function to clarify its use and allow the function to return
errors. Adjust the consumer in acpi-cpufreq to catch errors.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/include/asm/processor.h | 3 ---
arch/x86/kernel/acpi/cppc.c | 44 +++++++++++++++++++++++---------
drivers/cpufreq/acpi-cpufreq.c | 12 ++++++---
include/acpi/cppc_acpi.h | 5 ++++
4 files changed, 46 insertions(+), 18 deletions(-)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigne
}
#ifdef CONFIG_CPU_SUP_AMD
-extern u32 amd_get_highest_perf(void);
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
@@ -705,7 +703,6 @@ static __always_inline void amd_clear_di
extern void amd_check_microcode(void);
#else
-static inline u32 amd_get_highest_perf(void) { return 0; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void)
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_debug("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_debug("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+ perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
if (!perf_ratio) {
@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void)
mutex_unlock(&freq_invariance_lock);
}
-u32 amd_get_highest_perf(void)
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will the highest performance register value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
+ (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
+ *numerator = 166;
+ return 0;
+ }
if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
+ (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
+ *numerator = 166;
+ return 0;
+ }
+ *numerator = 255;
- return 255;
+ return 0;
}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned
return 0;
}
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
- highest_perf = amd_get_highest_perf();
- else
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ ret = amd_get_boost_ratio_numerator(cpu, &highest_perf);
+ if (ret) {
+ pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n",
+ cpu, ret);
+ return 0;
+ }
+ } else {
highest_perf = perf_caps.highest_perf;
+ }
nominal_perf = perf_caps.nominal_perf;
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
@@ -234,6 +235,10 @@ static inline int cppc_get_auto_sel_caps
{
return -EOPNOTSUPP;
}
+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ return -EOPNOTSUPP;
+}
#endif /* !CONFIG_ACPI_CPPC_LIB */
#endif /* _CPPC_ACPI_H*/

View File

@ -0,0 +1,35 @@
From 6f10d066dce0f1781b514a0352f0b427a32b1bb2 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:00 -0500
Subject: ACPI: CPPC: Drop check for non zero perf ratio
perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number.
Shifting by one will never have a zero value.
Drop the check.
Suggested-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void)
return;
}
- perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
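
The single expression that remains is easy to sanity-check outside the kernel. A throwaway userspace calculation with assumed inputs (numerator 166, nominal_perf 100; SCHED_CAPACITY_SCALE is 1 << 10 in current kernels) also shows why the dropped zero-check could never fire: adding SCHED_CAPACITY_SCALE before the shift keeps the result at 512 or above.

```
/* Standalone arithmetic check of the expression kept by this patch.
 * The inputs are assumed example values, not read from real hardware. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL	/* 1 << SCHED_CAPACITY_SHIFT in mainline */

int main(void)
{
	unsigned long long numerator = 166;	/* boost ratio numerator (assumed) */
	unsigned long long nominal_perf = 100;	/* CPPC nominal perf (assumed) */

	/* midpoint between max_boost and max_P, exactly as in the patch */
	unsigned long long perf_ratio =
		(numerator * SCHED_CAPACITY_SCALE / nominal_perf
		 + SCHED_CAPACITY_SCALE) >> 1;

	/* 166 * 1024 / 100 = 1699; (1699 + 1024) >> 1 = 1361, i.e. ~1.33x.
	 * Because SCHED_CAPACITY_SCALE is added before the shift, the result
	 * can never drop below SCHED_CAPACITY_SCALE / 2 = 512. */
	printf("perf_ratio = %llu\n", perf_ratio);
	return 0;
}
```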

View File

@ -0,0 +1,44 @@
From 8c142a91a58f24119e99d4e66b11890f4a4ef984 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:01 -0500
Subject: ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn
If the boost ratio isn't calculated properly for the system for any
reason this can cause other problems that are non-obvious.
Raise all messages to warn instead.
Suggested-by: Perry Yuan <Perry.Yuan@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void)
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
rc = amd_get_boost_ratio_numerator(0, &numerator);
if (rc) {
- pr_debug("Could not retrieve highest performance (%d)\n", rc);
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
return;
}
nominal_perf = perf_caps.nominal_perf;
if (!nominal_perf) {
- pr_debug("Could not retrieve nominal performance\n");
+ pr_warn("Could not retrieve nominal performance\n");
return;
}

View File

@ -0,0 +1,138 @@
From 952e7bdc4cf67603f230f8eb91818ad4676e5a83 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:02 -0500
Subject: x86/amd: Move amd_get_highest_perf() out of amd-pstate
amd_pstate_get_highest_perf() is a helper used to get the highest perf
value on AMD systems. It's used in amd-pstate as part of preferred
core handling, but applicable for acpi-cpufreq as well.
Move it out to cppc handling code as amd_get_highest_perf().
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 30 ++++++++++++++++++++++++++++++
drivers/cpufreq/amd-pstate.c | 34 ++--------------------------------
include/acpi/cppc_acpi.h | 5 +++++
3 files changed, 37 insertions(+), 32 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void)
mutex_unlock(&freq_invariance_lock);
}
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
/**
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
* @cpu: CPU to get numerator for.
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -837,36 +837,6 @@ static void amd_pstste_sched_prefcore_wo
}
static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
-/*
- * Get the highest performance register value.
- * @cpu: CPU from which to get highest performance.
- * @highest_perf: Return address.
- *
- * Return: 0 for success, -EIO otherwise.
- */
-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
-{
- int ret;
-
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- u64 cap1;
-
- ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
- if (ret)
- return ret;
- WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
- } else {
- u64 cppc_highest_perf;
-
- ret = cppc_get_highest_perf(cpu, &cppc_highest_perf);
- if (ret)
- return ret;
- WRITE_ONCE(*highest_perf, cppc_highest_perf);
- }
-
- return (ret);
-}
-
#define CPPC_MAX_PERF U8_MAX
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
@@ -874,7 +844,7 @@ static void amd_pstate_init_prefcore(str
int ret, prio;
u32 highest_perf;
- ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
+ ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
if (ret)
return;
@@ -918,7 +888,7 @@ static void amd_pstate_update_limits(uns
if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
goto free_cpufreq_put;
- ret = amd_pstate_get_highest_perf(cpu, &cur_high);
+ ret = amd_get_highest_perf(cpu, &cur_high);
if (ret)
goto free_cpufreq_put;
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -161,6 +161,7 @@ extern int cppc_get_epp_perf(int cpunum,
extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
@@ -235,6 +236,10 @@ static inline int cppc_get_auto_sel_caps
{
return -EOPNOTSUPP;
}
+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ return -ENODEV;
+}
static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
return -EOPNOTSUPP;

View File

@ -0,0 +1,251 @@
From 3ab7da5bbf2087982dbfe2b0f2937d0dddc3afb1 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:03 -0500
Subject: x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()
AMD systems that support preferred cores will use "166" as their
numerator for max frequency calculations instead of "255".
Add a function for detecting preferred cores by looking at the
highest perf value on all cores.
If preferred cores are enabled return 166 and if disabled the
value in the highest perf register. As the function will be called
multiple times, cache the values for the boost numerator and if
preferred cores will be enabled in global variables.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
arch/x86/kernel/acpi/cppc.c | 93 ++++++++++++++++++++++++++++++++----
drivers/cpufreq/amd-pstate.c | 34 +++++--------
include/acpi/cppc_acpi.h | 5 ++
3 files changed, 101 insertions(+), 31 deletions(-)
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,16 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -147,6 +157,66 @@ out:
EXPORT_SYMBOL_GPL(amd_get_highest_perf);
/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
* amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
* @cpu: CPU to get numerator for.
* @numerator: Output variable for numerator.
@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf);
* a CPU. On systems that support preferred cores, this will be a hardcoded
* value. On other systems this will the highest performance register value.
*
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
* Return: 0 for success, negative error code otherwise.
*/
int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
- *numerator = 166;
- return 0;
- }
+ bool prefcore;
+ int ret;
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
- *numerator = 166;
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
return 0;
}
- *numerator = 255;
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
return 0;
}
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -841,32 +841,18 @@ static DECLARE_WORK(sched_prefcore_work,
static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
{
- int ret, prio;
- u32 highest_perf;
-
- ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
- if (ret)
+ /* user disabled or not detected */
+ if (!amd_pstate_prefcore)
return;
cpudata->hw_prefcore = true;
- /* check if CPPC preferred core feature is enabled*/
- if (highest_perf < CPPC_MAX_PERF)
- prio = (int)highest_perf;
- else {
- pr_debug("AMD CPPC preferred core is unsupported!\n");
- cpudata->hw_prefcore = false;
- return;
- }
-
- if (!amd_pstate_prefcore)
- return;
/*
* The priorities can be set regardless of whether or not
* sched_set_itmt_support(true) has been called and it is valid to
* update them at any time after it has been called.
*/
- sched_set_itmt_core_prio(prio, cpudata->cpu);
+ sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu);
schedule_work(&sched_prefcore_work);
}
@@ -1037,12 +1023,12 @@ static int amd_pstate_cpu_init(struct cp
cpudata->cpu = policy->cpu;
- amd_pstate_init_prefcore(cpudata);
-
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
+ amd_pstate_init_prefcore(cpudata);
+
ret = amd_pstate_init_freq(cpudata);
if (ret)
goto free_cpudata1;
@@ -1493,12 +1479,12 @@ static int amd_pstate_epp_cpu_init(struc
cpudata->cpu = policy->cpu;
cpudata->epp_policy = 0;
- amd_pstate_init_prefcore(cpudata);
-
ret = amd_pstate_init_perf(cpudata);
if (ret)
goto free_cpudata1;
+ amd_pstate_init_prefcore(cpudata);
+
ret = amd_pstate_init_freq(cpudata);
if (ret)
goto free_cpudata1;
@@ -1960,6 +1946,12 @@ static int __init amd_pstate_init(void)
static_call_update(amd_pstate_update_perf, cppc_update_perf);
}
+ if (amd_pstate_prefcore) {
+ ret = amd_detect_prefcore(&amd_pstate_prefcore);
+ if (ret)
+ return ret;
+ }
+
/* enable amd pstate feature */
ret = amd_pstate_enable(true);
if (ret) {
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -163,6 +163,7 @@ extern int cppc_get_auto_sel_caps(int cp
extern int cppc_set_auto_sel(int cpu, bool enable);
extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
+extern int amd_detect_prefcore(bool *detected);
#else /* !CONFIG_ACPI_CPPC_LIB */
static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
{
@@ -244,6 +245,10 @@ static inline int amd_get_boost_ratio_nu
{
return -EOPNOTSUPP;
}
+static inline int amd_detect_prefcore(bool *detected)
+{
+ return -ENODEV;
+}
#endif /* !CONFIG_ACPI_CPPC_LIB */
#endif /* _CPPC_ACPI_H*/

View File

@ -0,0 +1,169 @@
From 68d89574b86625f4bd7a784fe9bcc221dc290e4f Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:04 -0500
Subject: cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into
amd_get_boost_ratio_numerator()
The special case in amd_pstate_highest_perf_set() is the value used
for calculating the boost numerator. Merge this into
amd_get_boost_ratio_numerator() and then use that to calculate boost
ratio.
This allows dropping more special casing of the highest perf value.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 3 +-
arch/x86/kernel/acpi/cppc.c | 16 +++++++
drivers/cpufreq/amd-pstate.c | 52 ++++-----------------
3 files changed, 28 insertions(+), 43 deletions(-)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Perfo
In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
table, so we need to expose it to sysfs. If boost is not active, but
still supported, this maximum frequency will be larger than the one in
-``cpuinfo``.
+``cpuinfo``. On systems that support preferred core, the driver will have
+different values for some cores than others.
This attribute is read-only.
``amd_pstate_lowest_nonlinear_freq``
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,7 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
#define CPPC_HIGHEST_PERF_PREFCORE 166
enum amd_pref_core {
@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsign
*numerator = boost_numerator;
return 0;
}
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
*numerator = CPPC_HIGHEST_PERF_PREFCORE;
return 0;
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -52,8 +52,6 @@
#define AMD_PSTATE_TRANSITION_LATENCY 20000
#define AMD_PSTATE_TRANSITION_DELAY 1000
#define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600
-#define CPPC_HIGHEST_PERF_PERFORMANCE 196
-#define CPPC_HIGHEST_PERF_DEFAULT 166
#define AMD_CPPC_EPP_PERFORMANCE 0x00
#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80
@@ -398,43 +396,17 @@ static inline int amd_pstate_enable(bool
return static_call(amd_pstate_enable)(enable);
}
-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /*
- * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
- * the highest performance level is set to 196.
- * https://bugzilla.kernel.org/show_bug.cgi?id=218759
- */
- if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f))
- return CPPC_HIGHEST_PERF_PERFORMANCE;
-
- return CPPC_HIGHEST_PERF_DEFAULT;
-}
-
static int pstate_init_perf(struct amd_cpudata *cpudata)
{
u64 cap1;
- u32 highest_perf;
int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
&cap1);
if (ret)
return ret;
- /* For platforms that do not support the preferred core feature, the
- * highest_pef may be configured with 166 or 255, to avoid max frequency
- * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as
- * the default max perf.
- */
- if (cpudata->hw_prefcore)
- highest_perf = amd_pstate_highest_perf_set(cpudata);
- else
- highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+ WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+ WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1));
WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
@@ -446,19 +418,13 @@ static int pstate_init_perf(struct amd_c
static int cppc_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
- u32 highest_perf;
int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
if (ret)
return ret;
- if (cpudata->hw_prefcore)
- highest_perf = amd_pstate_highest_perf_set(cpudata);
- else
- highest_perf = cppc_perf.highest_perf;
-
- WRITE_ONCE(cpudata->highest_perf, highest_perf);
- WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+ WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
+ WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf);
WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
WRITE_ONCE(cpudata->lowest_nonlinear_perf,
cppc_perf.lowest_nonlinear_perf);
@@ -944,8 +910,8 @@ static u32 amd_pstate_get_transition_lat
static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
{
int ret;
- u32 min_freq;
- u32 highest_perf, max_freq;
+ u32 min_freq, max_freq;
+ u64 numerator;
u32 nominal_perf, nominal_freq;
u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
u32 boost_ratio, lowest_nonlinear_ratio;
@@ -967,8 +933,10 @@ static int amd_pstate_init_freq(struct a
nominal_perf = READ_ONCE(cpudata->nominal_perf);
- highest_perf = READ_ONCE(cpudata->highest_perf);
- boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
+ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
+ if (ret)
+ return ret;
+ boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);

View File

@ -0,0 +1,42 @@
From deed718125e73b6bf280dcebb80c39108226388c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:05 -0500
Subject: cpufreq: amd-pstate: Optimize amd_pstate_update_limits()
Don't take and release the mutex when prefcore isn't present and
avoid initialization of variables that will be initially set
in the function.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -836,17 +836,17 @@ static void amd_pstate_update_limits(uns
cpudata = policy->driver_data;
- mutex_lock(&amd_pstate_driver_lock);
- if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
- goto free_cpufreq_put;
+ if (!amd_pstate_prefcore)
+ return;
+ mutex_lock(&amd_pstate_driver_lock);
ret = amd_get_highest_perf(cpu, &cur_high);
if (ret)
goto free_cpufreq_put;
prev_high = READ_ONCE(cpudata->prefcore_ranking);
- if (prev_high != cur_high) {
- highest_perf_changed = true;
+ highest_perf_changed = (prev_high != cur_high);
+ if (highest_perf_changed) {
WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
if (cur_high < CPPC_MAX_PERF)

View File

@ -0,0 +1,29 @@
From 391075a34e392c7cacd338a6b034a21a10679855 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:06 -0500
Subject: cpufreq: amd-pstate: Add documentation for `amd_pstate_hw_prefcore`
Explain that the sysfs file represents both preferred core being
enabled by the user and supported by the hardware.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++
1 file changed, 5 insertions(+)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CP
<perf_cap_>`_.)
This attribute is read-only.
+``amd_pstate_hw_prefcore``
+
+Whether the platform supports the preferred core feature and it has been
+enabled. This attribute is read-only.
+
``energy_performance_available_preferences``
A list of all the supported EPP preferences that could be used for

View File

@ -0,0 +1,42 @@
From 2ed9874f6dcafcc2bee7a922af9e1d1c62dbeb18 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:30:07 -0500
Subject: amd-pstate: Add missing documentation for
`amd_pstate_prefcore_ranking`
`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU
core based on platform conditions. Explicitly include it in the
documentation.
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
---
Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performa
table, so we need to expose it to sysfs. If boost is not active, but
still supported, this maximum frequency will be larger than the one in
``cpuinfo``. On systems that support preferred core, the driver will have
-different values for some cores than others.
+different values for some cores than others and this will reflect the values
+advertised by the platform at bootup.
This attribute is read-only.
``amd_pstate_lowest_nonlinear_freq``
@@ -268,6 +269,12 @@ This attribute is read-only.
Whether the platform supports the preferred core feature and it has been
enabled. This attribute is read-only.
+``amd_pstate_prefcore_ranking``
+
+The performance ranking of the core. This number doesn't have any unit, but
+larger numbers are preferred at the time of reading. This can change at
+runtime based on platform conditions. This attribute is read-only.
+
``energy_performance_available_preferences``
A list of all the supported EPP preferences that could be used for
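
The preferred-core attributes documented above live in the usual per-policy cpufreq directory (the same /sys/devices/system/cpu/cpuX/cpufreq/ layout referenced later in this series). A rough userspace sketch for inspecting the ranking on each CPU; enumerating by consecutive CPU number and stopping at the first missing entry is an assumption made for brevity:

```
/* Rough userspace sketch: print the preferred-core ranking of each CPU.
 * The attribute name comes from the documentation above; the sysfs path
 * layout and the consecutive CPU numbering are assumptions. */
#include <stdio.h>

int main(void)
{
	char path[128], buf[32];

	for (int cpu = 0; ; cpu++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/cpufreq/amd_pstate_prefcore_ranking",
			 cpu);

		FILE *f = fopen(path, "r");
		if (!f)
			break;	/* no more CPUs, or attribute not present */

		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d ranking: %s", cpu, buf);
		fclose(f);
	}
	return 0;
}
```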

View File

@ -0,0 +1,24 @@
From 2e2ba39aec71fb51e897c3275b255ef806800cf0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 5 Sep 2024 11:23:51 -0500
Subject: cpufreq/amd-pstate: Fix non kerneldoc comment
The comment for amd_cppc_supported() isn't meant to be kernel doc.
Fixes: cb817ec6673b7 ("cpufreq: amd-pstate: show CPPC debug message if CPPC is not supported")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1786,7 +1786,7 @@ static int __init amd_pstate_set_driver(
return -EINVAL;
}
-/**
+/*
* CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
* show the debug message that helps to check if the CPU has CPPC support for loading issue.
*/

View File

@ -0,0 +1,24 @@
From 185e64a7e1a749593f3d6dadc666da9dda82d48c Mon Sep 17 00:00:00 2001
From: Qianqiang Liu <qianqiang.liu@163.com>
Date: Wed, 11 Sep 2024 07:39:24 +0800
Subject: cpufreq/amd-pstate-ut: Fix an "Uninitialized variables" issue
Using uninitialized value "mode2" when calling "amd_pstate_get_mode_string".
Set "mode2" to "AMD_PSTATE_DISABLE" by default.
Signed-off-by: Qianqiang Liu <qianqiang.liu@163.com>
---
drivers/cpufreq/amd-pstate-ut.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -270,7 +270,7 @@ static int amd_pstate_set_mode(enum amd_
static void amd_pstate_ut_check_driver(u32 index)
{
- enum amd_pstate_mode mode1, mode2;
+ enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
int ret;
for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {

View File

@ -0,0 +1,108 @@
From d74ce254cc470da670d6b90c69bab553cdbde62b Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Tue, 17 Sep 2024 09:14:35 +0000
Subject: cpufreq/amd-pstate: Rename MSR and shared memory specific functions
Existing function names "cppc_*" and "pstate_*" for shared memory and
MSR based systems are not intuitive enough, replace them with "shmem_*" and
"msr_*" respectively.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -263,7 +263,7 @@ static int amd_pstate_get_energy_pref_in
return index;
}
-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
+static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, bool fast_switch)
{
if (fast_switch)
@@ -273,7 +273,7 @@ static void pstate_update_perf(struct am
READ_ONCE(cpudata->cppc_req_cached));
}
-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf);
+DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
@@ -336,7 +336,7 @@ static int amd_pstate_set_energy_pref_in
return ret;
}
-static inline int pstate_enable(bool enable)
+static inline int msr_enable(bool enable)
{
int ret, cpu;
unsigned long logical_proc_id_mask = 0;
@@ -362,7 +362,7 @@ static inline int pstate_enable(bool ena
return 0;
}
-static int cppc_enable(bool enable)
+static int shmem_enable(bool enable)
{
int cpu, ret = 0;
struct cppc_perf_ctrls perf_ctrls;
@@ -389,14 +389,14 @@ static int cppc_enable(bool enable)
return ret;
}
-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable);
+DEFINE_STATIC_CALL(amd_pstate_enable, msr_enable);
static inline int amd_pstate_enable(bool enable)
{
return static_call(amd_pstate_enable)(enable);
}
-static int pstate_init_perf(struct amd_cpudata *cpudata)
+static int msr_init_perf(struct amd_cpudata *cpudata)
{
u64 cap1;
@@ -415,7 +415,7 @@ static int pstate_init_perf(struct amd_c
return 0;
}
-static int cppc_init_perf(struct amd_cpudata *cpudata)
+static int shmem_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
@@ -450,14 +450,14 @@ static int cppc_init_perf(struct amd_cpu
return ret;
}
-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
+DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf);
static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata)
{
return static_call(amd_pstate_init_perf)(cpudata);
}
-static void cppc_update_perf(struct amd_cpudata *cpudata,
+static void shmem_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
u32 max_perf, bool fast_switch)
{
@@ -1909,9 +1909,9 @@ static int __init amd_pstate_init(void)
current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
- static_call_update(amd_pstate_enable, cppc_enable);
- static_call_update(amd_pstate_init_perf, cppc_init_perf);
- static_call_update(amd_pstate_update_perf, cppc_update_perf);
+ static_call_update(amd_pstate_enable, shmem_enable);
+ static_call_update(amd_pstate_init_perf, shmem_init_perf);
+ static_call_update(amd_pstate_update_perf, shmem_update_perf);
}
if (amd_pstate_prefcore) {

View File

@ -0,0 +1,115 @@
From 787175146e26a199c06be4e6bf8cf8da0f757271 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:52 +0000
Subject: cpufreq: Add a callback to update the min_freq_req from drivers
Currently, there is no proper way to update the initial lower frequency
limit from cpufreq drivers. Only way is to add a new min_freq qos
request from the driver side, but it leads to the issue explained below.
The QoS infrastructure collates the constraints from multiple
subsystems and saves them in a plist. The "current value" is defined to
be the highest value in the plist for min_freq constraint.
The cpufreq core adds a qos_request for min_freq to be 0 and the amd-pstate
driver today adds qos request for min_freq to be lowest_freq, where
lowest_freq corresponds to CPPC.lowest_perf.
Eg: Suppose WLOG considering amd-pstate driver, lowest_freq is 400000 KHz,
lowest_non_linear_freq is 1200000 KHz.
At this point of time, the min_freq QoS plist looks like:
head--> 400000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)
When a user updates /sys/devices/system/cpu/cpuX/cpufreq/scaling_min_freq,
it only results in updating the cpufreq-core's node in the plist, where
say 0 becomes the newly echoed value.
Now, if the user echoes a value 1000000 KHz, to scaling_min_freq, then the
new list would be
head--> 1000000 KHz (registered by cpufreq core) --> 400000 KHz (registered
by amd-pstate)
and the new "current value" of the min_freq QoS constraint will be 1000000
KHz, this is the scenario where it works as expected.
Suppose we change the amd-pstate driver code's min_freq qos constraint
to lowest_non_linear_freq instead of lowest_freq, then the user will
never be able to request a value below that, due to the following:
At boot time, the min_freq QoS plist would be
head--> 1200000 KHz (registered by amd-pstate) --> 0 KHz (registered by
cpufreq core)
When the user echoes a value of 1000000 KHz, to
/sys/devices/..../scaling_min_freq, then the new list would be
head--> 1200000 KHz (registered by amd-pstate) --> 1000000 KHz (registered
by cpufreq core)
with the new "current value" of the min_freq QoS remaining 1200000 KHz.
Since the current value has not changed, there won't be any notifications
sent to the subsystems which have added their QoS constraints. In
particular, the amd-pstate driver will not get the notification, and thus,
the user's request to lower the scaling_min_freq will be ineffective.
Hence, it is advisable to have a single source of truth for the min and
max freq QoS constraints between the cpufreq and the cpufreq drivers.
So add a new callback get_init_min_freq() add in struct cpufreq_driver,
which allows amd-pstate (or any other cpufreq driver) to override the
default min_freq value being set in the policy->min_freq_req. Now
scaling_min_freq can be modified by the user to any value (lower or
higher than the init value) later on if desired.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/cpufreq.c | 6 +++++-
include/linux/cpufreq.h | 6 ++++++
2 files changed, 11 insertions(+), 1 deletion(-)
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1380,6 +1380,7 @@ static int cpufreq_online(unsigned int c
bool new_policy;
unsigned long flags;
unsigned int j;
+ u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
@@ -1464,9 +1465,12 @@ static int cpufreq_online(unsigned int c
goto out_destroy_policy;
}
+ if (cpufreq_driver->get_init_min_freq)
+ init_min_freq = cpufreq_driver->get_init_min_freq(policy);
+
ret = freq_qos_add_request(&policy->constraints,
policy->min_freq_req, FREQ_QOS_MIN,
- FREQ_QOS_MIN_DEFAULT_VALUE);
+ init_min_freq);
if (ret < 0) {
/*
* So we don't call freq_qos_remove_request() for an
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -414,6 +414,12 @@ struct cpufreq_driver {
* policy is properly initialized, but before the governor is started.
*/
void (*register_em)(struct cpufreq_policy *policy);
+
+ /*
+ * Set by drivers that want to initialize the policy->min_freq_req with
+ * a value different from the default value (0) in cpufreq core.
+ */
+ int (*get_init_min_freq)(struct cpufreq_policy *policy);
};
/* flags */
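
The aggregation rule described in the changelog (for a MIN-type constraint the "current value" is the highest value in the plist) can be modelled in a few lines of plain C, using the changelog's own example numbers, to make the masking problem concrete:

```
/* Toy model of the min_freq QoS aggregation described above; userspace
 * code with the changelog's example numbers, not kernel code. */
#include <stdio.h>

/* For a MIN-type frequency constraint the effective value is the highest
 * of all registered requests. */
static unsigned int effective_min_freq(const unsigned int *requests, int n)
{
	unsigned int cur = 0;

	for (int i = 0; i < n; i++)
		if (requests[i] > cur)
			cur = requests[i];
	return cur;
}

int main(void)
{
	/* driver registers lowest_nonlinear_freq, user echoes 1000000 kHz */
	unsigned int requests[] = { 1200000, 1000000 };

	/* Prints 1200000: the user's lower request never becomes effective,
	 * which is why the get_init_min_freq() callback added here is
	 * preferable to a driver-side QoS request. */
	printf("effective scaling_min_freq = %u kHz\n",
	       effective_min_freq(requests, 2));
	return 0;
}
```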

View File

@ -0,0 +1,79 @@
From f5b234be445a45b0bcacc37e0aad7a6bc7900eac Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:54 +0000
Subject: cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq
According to the AMD architectural programmer's manual volume 2 [1], in
section "17.6.4.1 CPPC_CAPABILITY_1" lowest_nonlinear_perf is described
as "Reports the most energy efficient performance level (in terms of
performance per watt). Above this threshold, lower performance levels
generally result in increased energy efficiency. Reducing performance
below this threshold does not result in total energy savings for a given
computation, although it reduces instantaneous power consumption". So
lowest_nonlinear_perf is the most power efficient performance level, and
going below that would lead to a worse performance/watt.
Also, setting the minimum frequency to lowest_nonlinear_freq (instead of
lowest_freq) allows the CPU to idle at a higher frequency which leads
to more time being spent in a deeper idle state (as trivial idle tasks
are completed sooner). This has shown a power benefit in some systems,
in other systems, power consumption has increased but so has the
throughput/watt.
Use the get_init_min_freq() callback to set the initial lower limit for
amd-pstate driver to lowest_nonlinear_freq instead of lowest_freq.
Link: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf [1]
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1025,13 +1025,6 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
- FREQ_QOS_MIN, policy->cpuinfo.min_freq);
- if (ret < 0) {
- dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
- goto free_cpudata1;
- }
-
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
@@ -1736,6 +1729,13 @@ static int amd_pstate_epp_resume(struct
return 0;
}
+static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
+{
+ struct amd_cpudata *cpudata = policy->driver_data;
+
+ return READ_ONCE(cpudata->lowest_nonlinear_freq);
+}
+
static struct cpufreq_driver amd_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
.verify = amd_pstate_verify,
@@ -1749,6 +1749,7 @@ static struct cpufreq_driver amd_pstate_
.update_limits = amd_pstate_update_limits,
.name = "amd-pstate",
.attr = amd_pstate_attr,
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static struct cpufreq_driver amd_pstate_epp_driver = {
@@ -1765,6 +1766,7 @@ static struct cpufreq_driver amd_pstate_
.set_boost = amd_pstate_set_boost,
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
+ .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static int __init amd_pstate_set_driver(int mode_idx)

View File

@ -0,0 +1,103 @@
From f7b2b3a1c0d015c4272793bed89734c5cffb354c Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 3 Oct 2024 08:39:56 +0000
Subject: cpufreq/amd-pstate: Cleanup the old min_freq qos request remnants
Convert the freq_qos_request array in struct amd_cpudata to a single
variable (only for max_freq request). Remove the references to cpudata->req
array. Remove and rename the jump labels accordingly.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 19 ++++++++-----------
drivers/cpufreq/amd-pstate.h | 4 ++--
2 files changed, 10 insertions(+), 13 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -726,7 +726,7 @@ static int amd_pstate_cpu_boost_update(s
policy->max = policy->cpuinfo.max_freq;
if (cppc_state == AMD_PSTATE_PASSIVE) {
- ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
+ ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
if (ret < 0)
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
}
@@ -993,17 +993,17 @@ static int amd_pstate_cpu_init(struct cp
ret = amd_pstate_init_perf(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
amd_pstate_init_prefcore(cpudata);
ret = amd_pstate_init_freq(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
ret = amd_pstate_init_boost_support(cpudata);
if (ret)
- goto free_cpudata1;
+ goto free_cpudata;
min_freq = READ_ONCE(cpudata->min_freq);
max_freq = READ_ONCE(cpudata->max_freq);
@@ -1025,11 +1025,11 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
- ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
- goto free_cpudata2;
+ goto free_cpudata;
}
cpudata->max_limit_freq = max_freq;
@@ -1042,9 +1042,7 @@ static int amd_pstate_cpu_init(struct cp
return 0;
-free_cpudata2:
- freq_qos_remove_request(&cpudata->req[0]);
-free_cpudata1:
+free_cpudata:
kfree(cpudata);
return ret;
}
@@ -1053,8 +1051,7 @@ static void amd_pstate_cpu_exit(struct c
{
struct amd_cpudata *cpudata = policy->driver_data;
- freq_qos_remove_request(&cpudata->req[1]);
- freq_qos_remove_request(&cpudata->req[0]);
+ freq_qos_remove_request(&cpudata->max_freq_req);
policy->fast_switch_possible = false;
kfree(cpudata);
}
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
/**
* struct amd_cpudata - private CPU data for AMD P-State
* @cpu: CPU number
- * @req: constraint request to apply
+ * @max_freq_req: maximum frequency constraint request to apply
* @cppc_req_cached: cached performance request hints
* @highest_perf: the maximum performance an individual processor may reach,
* assuming ideal conditions
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
struct amd_cpudata {
int cpu;
- struct freq_qos_request req[2];
+ struct freq_qos_request max_freq_req;
u64 cppc_req_cached;
u32 highest_perf;

View File

@ -0,0 +1,42 @@
From d1216c052bedbf6d79e4b0261e2f09e17c66ffd3 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 4 Oct 2024 12:23:04 +0000
Subject: cpufreq/amd-pstate: Fix amd_pstate mode switch on shared memory
systems
While switching the driver mode between active and passive, Collaborative
Processor Performance Control (CPPC) is disabled in
amd_pstate_unregister_driver(). But, it is not enabled back while registering
the new driver (passive or active). This leads to the new driver mode not
working correctly, so enable it back in amd_pstate_register_driver().
Fixes: 3ca7bc818d8c ("cpufreq: amd-pstate: Add guided mode control support via sysfs")
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1221,11 +1221,21 @@ static int amd_pstate_register_driver(in
return -EINVAL;
cppc_state = mode;
+
+ ret = amd_pstate_enable(true);
+ if (ret) {
+ pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n",
+ ret);
+ amd_pstate_driver_cleanup();
+ return ret;
+ }
+
ret = cpufreq_register_driver(current_pstate_driver);
if (ret) {
amd_pstate_driver_cleanup();
return ret;
}
+
return 0;
}

View File

@ -0,0 +1,57 @@
From c4fde0d177bdb33912f450914d84d6432391a8b5 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:16 -0500
Subject: cpufreq/amd-pstate: Use nominal perf for limits when boost is
disabled
When boost has been disabled the limit for perf should be nominal perf not
the highest perf. Using the latter to do calculations will lead to
incorrect values that are still above nominal.
Fixes: ad4caad58d91 ("cpufreq: amd-pstate: Merge amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()")
Reported-by: Peter Jung <ptr1337@cachyos.org>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219348
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -566,11 +566,16 @@ static int amd_pstate_verify(struct cpuf
static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy)
{
- u32 max_limit_perf, min_limit_perf, lowest_perf;
+ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf;
struct amd_cpudata *cpudata = policy->driver_data;
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+ if (cpudata->boost_supported && !policy->boost_enabled)
+ max_perf = READ_ONCE(cpudata->nominal_perf);
+ else
+ max_perf = READ_ONCE(cpudata->highest_perf);
+
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
lowest_perf = READ_ONCE(cpudata->lowest_perf);
if (min_limit_perf < lowest_perf)
@@ -1526,10 +1531,13 @@ static int amd_pstate_epp_update_limit(s
u64 value;
s16 epp;
- max_perf = READ_ONCE(cpudata->highest_perf);
+ if (cpudata->boost_supported && !policy->boost_enabled)
+ max_perf = READ_ONCE(cpudata->nominal_perf);
+ else
+ max_perf = READ_ONCE(cpudata->highest_perf);
min_perf = READ_ONCE(cpudata->lowest_perf);
- max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq);
- min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq);
+ max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
+ min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
if (min_limit_perf < min_perf)
min_limit_perf = min_perf;

View File

@ -0,0 +1,55 @@
From 01ad0fb3da95867947d923596a26b18d844afe3c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:17 -0500
Subject: cpufreq/amd-pstate: Don't update CPPC request in
amd_pstate_cpu_boost_update()
When boost is changed, the CPPC value is changed in amd_pstate_cpu_boost_update()
but then changed again when refresh_frequency_limits() and all its callbacks
run. The first write is pointless, so instead just update the limits for
the policy and let the policy refresh anchor everything properly.
Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 24 +-----------------------
1 file changed, 1 insertion(+), 23 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -695,34 +695,12 @@ static void amd_pstate_adjust_perf(unsig
static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
{
struct amd_cpudata *cpudata = policy->driver_data;
- struct cppc_perf_ctrls perf_ctrls;
- u32 highest_perf, nominal_perf, nominal_freq, max_freq;
+ u32 nominal_freq, max_freq;
int ret = 0;
- highest_perf = READ_ONCE(cpudata->highest_perf);
- nominal_perf = READ_ONCE(cpudata->nominal_perf);
nominal_freq = READ_ONCE(cpudata->nominal_freq);
max_freq = READ_ONCE(cpudata->max_freq);
- if (boot_cpu_has(X86_FEATURE_CPPC)) {
- u64 value = READ_ONCE(cpudata->cppc_req_cached);
-
- value &= ~GENMASK_ULL(7, 0);
- value |= on ? highest_perf : nominal_perf;
- WRITE_ONCE(cpudata->cppc_req_cached, value);
-
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- } else {
- perf_ctrls.max_perf = on ? highest_perf : nominal_perf;
- ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
- if (ret) {
- cpufreq_cpu_release(policy);
- pr_debug("Failed to set max perf on CPU:%d. ret:%d\n",
- cpudata->cpu, ret);
- return ret;
- }
- }
-
if (on)
policy->cpuinfo.max_freq = max_freq;
else if (policy->cpuinfo.max_freq > nominal_freq * 1000)

View File

@ -0,0 +1,49 @@
From 684d162c08ab86fff02861c907ecc92bf9c09af4 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:18 -0500
Subject: cpufreq/amd-pstate: Use amd_pstate_update_min_max_limit() for EPP
limits
When the EPP limits are updated, the maximum capable frequency of the
CPU is used as the upper limit instead of the policy's maximum.
Adjust amd_pstate_epp_update_limit() to reuse policy calculation code
from amd_pstate_update_min_max_limit().
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1505,26 +1505,13 @@ static void amd_pstate_epp_cpu_exit(stru
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
- u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
+ u32 max_perf, min_perf;
u64 value;
s16 epp;
- if (cpudata->boost_supported && !policy->boost_enabled)
- max_perf = READ_ONCE(cpudata->nominal_perf);
- else
- max_perf = READ_ONCE(cpudata->highest_perf);
+ max_perf = READ_ONCE(cpudata->highest_perf);
min_perf = READ_ONCE(cpudata->lowest_perf);
- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
-
- if (min_limit_perf < min_perf)
- min_limit_perf = min_perf;
-
- if (max_limit_perf < min_limit_perf)
- max_limit_perf = min_limit_perf;
-
- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
+ amd_pstate_update_min_max_limit(policy);
max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
cpudata->max_limit_perf);

View File

@ -0,0 +1,29 @@
From fa46d2873c9fa4060ce407e4bc5c7e29babce9d0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 12 Oct 2024 12:45:19 -0500
Subject: cpufreq/amd-pstate: Drop needless EPP initialization
The EPP value doesn't need to be cached to the CPPC request in
amd_pstate_epp_update_limit() because it's passed as an argument
at the end to amd_pstate_set_epp() and stored at that time.
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 6 ------
1 file changed, 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1548,12 +1548,6 @@ static int amd_pstate_epp_update_limit(s
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
epp = 0;
- /* Set initial EPP value */
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- value &= ~GENMASK_ULL(31, 24);
- value |= (u64)epp << 24;
- }
-
WRITE_ONCE(cpudata->cppc_req_cached, value);
return amd_pstate_set_epp(cpudata, epp);
}

View File

@ -0,0 +1,228 @@
From 649d296be0c7f0df6e71b4fca25fdbe75cb3994e Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Thu, 17 Oct 2024 17:03:11 +0200
Subject: amd-pstate-6.11: update setting the minimum frequency to
lowest_nonlinear_freq patchset to v3
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
---
drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++---------------
drivers/cpufreq/amd-pstate.h | 4 +--
drivers/cpufreq/cpufreq.c | 6 +---
include/linux/cpufreq.h | 6 ----
4 files changed, 43 insertions(+), 40 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -557,9 +557,28 @@ cpufreq_policy_put:
cpufreq_cpu_put(policy);
}
-static int amd_pstate_verify(struct cpufreq_policy_data *policy)
+static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
{
- cpufreq_verify_within_cpu_limits(policy);
+ /*
+ * Initialize lower frequency limit (i.e.policy->min) with
+ * lowest_nonlinear_frequency which is the most energy efficient
+ * frequency. Override the initial value set by cpufreq core and
+ * amd-pstate qos_requests.
+ */
+ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
+ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
+ struct amd_cpudata *cpudata;
+
+ if (!policy)
+ return -EINVAL;
+
+ cpudata = policy->driver_data;
+ policy_data->min = cpudata->lowest_nonlinear_freq;
+ cpufreq_cpu_put(policy);
+ }
+
+ cpufreq_verify_within_cpu_limits(policy_data);
+ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
return 0;
}
@@ -709,7 +728,7 @@ static int amd_pstate_cpu_boost_update(s
policy->max = policy->cpuinfo.max_freq;
if (cppc_state == AMD_PSTATE_PASSIVE) {
- ret = freq_qos_update_request(&cpudata->max_freq_req, policy->cpuinfo.max_freq);
+ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
if (ret < 0)
pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
}
@@ -976,17 +995,17 @@ static int amd_pstate_cpu_init(struct cp
ret = amd_pstate_init_perf(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
amd_pstate_init_prefcore(cpudata);
ret = amd_pstate_init_freq(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
ret = amd_pstate_init_boost_support(cpudata);
if (ret)
- goto free_cpudata;
+ goto free_cpudata1;
min_freq = READ_ONCE(cpudata->min_freq);
max_freq = READ_ONCE(cpudata->max_freq);
@@ -1008,11 +1027,18 @@ static int amd_pstate_cpu_init(struct cp
if (cpu_feature_enabled(X86_FEATURE_CPPC))
policy->fast_switch_possible = true;
- ret = freq_qos_add_request(&policy->constraints, &cpudata->max_freq_req,
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
+ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE);
+ if (ret < 0) {
+ dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
+ goto free_cpudata1;
+ }
+
+ ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1],
FREQ_QOS_MAX, policy->cpuinfo.max_freq);
if (ret < 0) {
dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
- goto free_cpudata;
+ goto free_cpudata2;
}
cpudata->max_limit_freq = max_freq;
@@ -1025,7 +1051,9 @@ static int amd_pstate_cpu_init(struct cp
return 0;
-free_cpudata:
+free_cpudata2:
+ freq_qos_remove_request(&cpudata->req[0]);
+free_cpudata1:
kfree(cpudata);
return ret;
}
@@ -1034,7 +1062,8 @@ static void amd_pstate_cpu_exit(struct c
{
struct amd_cpudata *cpudata = policy->driver_data;
- freq_qos_remove_request(&cpudata->max_freq_req);
+ freq_qos_remove_request(&cpudata->req[1]);
+ freq_qos_remove_request(&cpudata->req[0]);
policy->fast_switch_possible = false;
kfree(cpudata);
}
@@ -1658,13 +1687,6 @@ static int amd_pstate_epp_cpu_offline(st
return 0;
}
-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
-{
- cpufreq_verify_within_cpu_limits(policy);
- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
- return 0;
-}
-
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
@@ -1703,13 +1725,6 @@ static int amd_pstate_epp_resume(struct
return 0;
}
-static int amd_pstate_get_init_min_freq(struct cpufreq_policy *policy)
-{
- struct amd_cpudata *cpudata = policy->driver_data;
-
- return READ_ONCE(cpudata->lowest_nonlinear_freq);
-}
-
static struct cpufreq_driver amd_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS,
.verify = amd_pstate_verify,
@@ -1723,12 +1738,11 @@ static struct cpufreq_driver amd_pstate_
.update_limits = amd_pstate_update_limits,
.name = "amd-pstate",
.attr = amd_pstate_attr,
- .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static struct cpufreq_driver amd_pstate_epp_driver = {
.flags = CPUFREQ_CONST_LOOPS,
- .verify = amd_pstate_epp_verify_policy,
+ .verify = amd_pstate_verify,
.setpolicy = amd_pstate_epp_set_policy,
.init = amd_pstate_epp_cpu_init,
.exit = amd_pstate_epp_cpu_exit,
@@ -1740,7 +1754,6 @@ static struct cpufreq_driver amd_pstate_
.set_boost = amd_pstate_set_boost,
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
- .get_init_min_freq = amd_pstate_get_init_min_freq,
};
static int __init amd_pstate_set_driver(int mode_idx)
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -28,7 +28,7 @@ struct amd_aperf_mperf {
/**
* struct amd_cpudata - private CPU data for AMD P-State
* @cpu: CPU number
- * @max_freq_req: maximum frequency constraint request to apply
+ * @req: constraint request to apply
* @cppc_req_cached: cached performance request hints
* @highest_perf: the maximum performance an individual processor may reach,
* assuming ideal conditions
@@ -68,7 +68,7 @@ struct amd_aperf_mperf {
struct amd_cpudata {
int cpu;
- struct freq_qos_request max_freq_req;
+ struct freq_qos_request req[2];
u64 cppc_req_cached;
u32 highest_perf;
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1380,7 +1380,6 @@ static int cpufreq_online(unsigned int c
bool new_policy;
unsigned long flags;
unsigned int j;
- u32 init_min_freq = FREQ_QOS_MIN_DEFAULT_VALUE;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
@@ -1465,12 +1464,9 @@ static int cpufreq_online(unsigned int c
goto out_destroy_policy;
}
- if (cpufreq_driver->get_init_min_freq)
- init_min_freq = cpufreq_driver->get_init_min_freq(policy);
-
ret = freq_qos_add_request(&policy->constraints,
policy->min_freq_req, FREQ_QOS_MIN,
- init_min_freq);
+ FREQ_QOS_MIN_DEFAULT_VALUE);
if (ret < 0) {
/*
* So we don't call freq_qos_remove_request() for an
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -414,12 +414,6 @@ struct cpufreq_driver {
* policy is properly initialized, but before the governor is started.
*/
void (*register_em)(struct cpufreq_policy *policy);
-
- /*
- * Set by drivers that want to initialize the policy->min_freq_req with
- * a value different from the default value (0) in cpufreq core.
- */
- int (*get_init_min_freq)(struct cpufreq_policy *policy);
};
/* flags */

View File

@ -0,0 +1,44 @@
From db147a0a6341822a15fd9c4cd51f8dc4a9a1747b Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:27 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_register() in amd_pstate_init()
Replace a similar chunk of code in amd_pstate_init() with
amd_pstate_register() call.
Suggested-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1909,17 +1909,10 @@ static int __init amd_pstate_init(void)
return ret;
}
- /* enable amd pstate feature */
- ret = amd_pstate_enable(true);
- if (ret) {
- pr_err("failed to enable driver mode(%d)\n", cppc_state);
- return ret;
- }
-
- ret = cpufreq_register_driver(current_pstate_driver);
+ ret = amd_pstate_register_driver(cppc_state);
if (ret) {
pr_err("failed to register with return %d\n", ret);
- goto disable_driver;
+ return ret;
}
dev_root = bus_get_dev_root(&cpu_subsys);
@@ -1936,7 +1929,6 @@ static int __init amd_pstate_init(void)
global_attr_free:
cpufreq_unregister_driver(current_pstate_driver);
-disable_driver:
amd_pstate_enable(false);
return ret;
}

View File

@ -0,0 +1,81 @@
From 7c658490b05f6ab4dd59e1c25e75ba1037f6cfeb Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:29 +0000
Subject: cpufreq/amd-pstate: Call amd_pstate_set_driver() in
amd_pstate_register_driver()
Replace a similar chunk of code in amd_pstate_register_driver() with
amd_pstate_set_driver() call.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 47 +++++++++++++++++-------------------
1 file changed, 22 insertions(+), 25 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1221,16 +1221,32 @@ static void amd_pstate_driver_cleanup(vo
current_pstate_driver = NULL;
}
+static int amd_pstate_set_driver(int mode_idx)
+{
+ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
+ cppc_state = mode_idx;
+ if (cppc_state == AMD_PSTATE_DISABLE)
+ pr_info("driver is explicitly disabled\n");
+
+ if (cppc_state == AMD_PSTATE_ACTIVE)
+ current_pstate_driver = &amd_pstate_epp_driver;
+
+ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
+ current_pstate_driver = &amd_pstate_driver;
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
static int amd_pstate_register_driver(int mode)
{
int ret;
- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED)
- current_pstate_driver = &amd_pstate_driver;
- else if (mode == AMD_PSTATE_ACTIVE)
- current_pstate_driver = &amd_pstate_epp_driver;
- else
- return -EINVAL;
+ ret = amd_pstate_set_driver(mode);
+ if (ret)
+ return ret;
cppc_state = mode;
@@ -1756,25 +1772,6 @@ static struct cpufreq_driver amd_pstate_
.attr = amd_pstate_epp_attr,
};
-static int __init amd_pstate_set_driver(int mode_idx)
-{
- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
- cppc_state = mode_idx;
- if (cppc_state == AMD_PSTATE_DISABLE)
- pr_info("driver is explicitly disabled\n");
-
- if (cppc_state == AMD_PSTATE_ACTIVE)
- current_pstate_driver = &amd_pstate_epp_driver;
-
- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
- current_pstate_driver = &amd_pstate_driver;
-
- return 0;
- }
-
- return -EINVAL;
-}
-
/*
* CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
* show the debug message that helps to check if the CPU has CPPC support for loading issue.

View File

@ -0,0 +1,41 @@
From 55be5db97f4f52badc958463ee8d9cbc2ae91615 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:31 +0000
Subject: cpufreq/amd-pstate: Remove the switch case in amd_pstate_init()
Replace the switch case with a more readable if condition.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1873,21 +1873,15 @@ static int __init amd_pstate_init(void)
cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE;
}
- switch (cppc_state) {
- case AMD_PSTATE_DISABLE:
+ if (cppc_state == AMD_PSTATE_DISABLE) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
- case AMD_PSTATE_PASSIVE:
- case AMD_PSTATE_ACTIVE:
- case AMD_PSTATE_GUIDED:
- ret = amd_pstate_set_driver(cppc_state);
- if (ret)
- return ret;
- break;
- default:
- return -EINVAL;
}
+ ret = amd_pstate_set_driver(cppc_state);
+ if (ret)
+ return ret;
+
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");

View File

@ -0,0 +1,43 @@
From 7305364888151cb9e6b435c5f219ccfd18132b58 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Thu, 17 Oct 2024 10:05:33 +0000
Subject: cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call
amd_pstate_set_driver() is called twice, once in amd_pstate_init() and once
as part of amd_pstate_register_driver(). Move the code around and eliminate
the redundancy.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1878,9 +1878,11 @@ static int __init amd_pstate_init(void)
return -ENODEV;
}
- ret = amd_pstate_set_driver(cppc_state);
- if (ret)
+ ret = amd_pstate_register_driver(cppc_state);
+ if (ret) {
+ pr_err("failed to register with return %d\n", ret);
return ret;
+ }
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
@@ -1900,12 +1902,6 @@ static int __init amd_pstate_init(void)
return ret;
}
- ret = amd_pstate_register_driver(cppc_state);
- if (ret) {
- pr_err("failed to register with return %d\n", ret);
- return ret;
- }
-
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group);

View File

@ -0,0 +1,33 @@
From 5886ef269d069c72ea952cb00699e16221289e8c Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 17 Oct 2024 12:34:39 -0500
Subject: cpufreq/amd-pstate-ut: Add fix for min freq unit test
commit 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to
lowest_nonlinear_freq") changed the iniital minimum frequency to lowest
nonlinear frequency, but the unit tests weren't updated and now fail.
Update them to match this same change.
Fixes: 642aff3964b0f ("cpufreq/amd-pstate: Set the initial min_freq to lowest_nonlinear_freq")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate-ut.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32
goto skip_test;
}
- if (cpudata->min_freq != policy->min) {
+ if (cpudata->lowest_nonlinear_freq != policy->min) {
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n",
- __func__, cpu, cpudata->min_freq, policy->min);
+ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
+ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
goto skip_test;
}

View File

@ -0,0 +1,33 @@
From e82b9b5a56bcac18cae68878fe67263279805735 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Date: Mon, 21 Oct 2024 15:48:35 +0530
Subject: amd-pstate: Set min_perf to nominal_perf for active mode performance
gov
The amd-pstate driver sets CPPC_REQ.min_perf to CPPC_REQ.max_perf when
in active mode with the performance governor. Typically CPPC_REQ.max_perf
is set to CPPC.highest_perf. This causes frequency throttling on
power-limited platforms, which leads to performance regressions on
certain classes of workloads.
Hence, set CPPC_REQ.min_perf to CPPC.nominal_perf or
CPPC_REQ.max_perf, whichever of the two is lower.
Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors")
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1565,7 +1565,7 @@ static int amd_pstate_epp_update_limit(s
value = READ_ONCE(cpudata->cppc_req_cached);
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
- min_perf = max_perf;
+ min_perf = min(cpudata->nominal_perf, max_perf);
/* Initial min/max values for CPPC Performance Controls Register */
value &= ~AMD_CPPC_MIN_PERF(~0L);

View File

@ -0,0 +1,44 @@
From 497447cf96a785a4edd0756da5d5718037f5687c Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 21 Oct 2024 15:48:36 +0530
Subject: amd-pstate: Switch to amd-pstate by default on some Server platforms
Currently the default cpufreq driver for all the AMD EPYC servers is
acpi-cpufreq. Going forward, switch to amd-pstate as the default
driver on the AMD EPYC server platforms with CPU family 0x1A or
higher. The default mode will be active mode.
Testing shows that amd-pstate in active mode with the performance
governor provides comparable or better performance per watt compared to
acpi-cpufreq + performance governor.
Likewise, amd-pstate in active mode with the powersave governor and
energy_performance_preference=power (EPP=255) provides comparable or
better performance per watt compared to acpi-cpufreq + schedutil governor
for a wide range of workloads.
Users can still revert to the acpi-cpufreq driver on these platforms
with the "amd_pstate=disable" kernel command-line parameter.
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1862,10 +1862,10 @@ static int __init amd_pstate_init(void)
if (cppc_state == AMD_PSTATE_UNDEFINED) {
/* Disable on the following configs by default:
* 1. Undefined platforms
- * 2. Server platforms
+ * 2. Server platforms with CPUs older than Family 0x1A.
*/
if (amd_pstate_acpi_pm_profile_undefined() ||
- amd_pstate_acpi_pm_profile_server()) {
+ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
}

View File

@ -0,0 +1,38 @@
From a4d255935a1ea6e4b10167df942ec641079bcdf7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:41 -0500
Subject: cpufreq/amd-pstate: Push adjust_perf vfunc init into cpu_init
As the driver can be changed in and out of different modes, it's possible
that adjust_perf is assigned when it shouldn't be.
This could happen if an MSR design is started up in passive mode and then
switches to active mode.
To solve this, explicitly clear `adjust_perf` in amd_pstate_epp_cpu_init().
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1528,6 +1528,8 @@ static int amd_pstate_epp_cpu_init(struc
WRITE_ONCE(cpudata->cppc_cap1_cached, value);
}
+ current_pstate_driver->adjust_perf = NULL;
+
return 0;
free_cpudata1:
@@ -1887,8 +1889,6 @@ static int __init amd_pstate_init(void)
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
- if (cppc_state != AMD_PSTATE_ACTIVE)
- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
static_call_update(amd_pstate_enable, shmem_enable);

View File

@ -0,0 +1,47 @@
From c42a82a583646dcbba8500d47ed878616ab5c33a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 28 Oct 2024 09:55:42 -0500
Subject: cpufreq/amd-pstate: Move registration after static function call
update
On shared memory designs, the static functions need to work before
registration is done, or the system can hang at bootup.
Move the registration to later in amd_pstate_init() to solve this.
Fixes: e238968a2087 ("cpufreq/amd-pstate: Remove the redundant amd_pstate_set_driver() call")
Reported-by: Klara Modin <klarasmodin@gmail.com>
Closes: https://lore.kernel.org/linux-pm/cf9c146d-bacf-444e-92e2-15ebf513af96@gmail.com/#t
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
drivers/cpufreq/amd-pstate.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1880,12 +1880,6 @@ static int __init amd_pstate_init(void)
return -ENODEV;
}
- ret = amd_pstate_register_driver(cppc_state);
- if (ret) {
- pr_err("failed to register with return %d\n", ret);
- return ret;
- }
-
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
@@ -1896,6 +1890,12 @@ static int __init amd_pstate_init(void)
static_call_update(amd_pstate_update_perf, shmem_update_perf);
}
+ ret = amd_pstate_register_driver(cppc_state);
+ if (ret) {
+ pr_err("failed to register with return %d\n", ret);
+ return ret;
+ }
+
if (amd_pstate_prefcore) {
ret = amd_detect_prefcore(&amd_pstate_prefcore);
if (ret)

View File

@ -0,0 +1,321 @@
From 023d6b8aa8d8b346cfdcccf5ca4cb880c8d41d87 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:37 -0700
Subject: perf: Generic hotplug support for a PMU with a scope
The perf subsystem assumes that the counters of a PMU are per-CPU, so
the user-space tool reads a counter from each CPU in system-wide
mode. However, many PMUs don't have a per-CPU counter; the counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict reads to one CPU that
stands for a specific scope. In case the given CPU is removed,
hotplug support has to be implemented by each such driver.
The code to support the cpumask and hotplug is very similar:
- Expose a cpumask in sysfs.
- Pick another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set the CPU from the cpumask as event->cpu.
Similar duplicated code is implemented for each such PMU driver. It
would be good to introduce a generic infrastructure to avoid such
duplication.
Five common scopes are implemented here: core, die, cluster, pkg, and
system-wide. The scope can be set when a PMU is registered; if so, a
"cpumask" is automatically exposed for the PMU.
The "cpumask" comes from perf_online_<scope>_mask, which tracks
the active CPU for each scope. These masks are set when the first CPU of a
scope comes online via the generic perf hotplug support. When the
corresponding CPU is removed, perf_online_<scope>_mask is updated
accordingly and the PMU is moved to a new CPU from the same scope
if possible.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 18 ++++
kernel/events/core.c | 164 ++++++++++++++++++++++++++++++++++++-
2 files changed, 180 insertions(+), 2 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -292,6 +292,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+ PERF_PMU_SCOPE_NONE = 0,
+ PERF_PMU_SCOPE_CORE,
+ PERF_PMU_SCOPE_DIE,
+ PERF_PMU_SCOPE_CLUSTER,
+ PERF_PMU_SCOPE_PKG,
+ PERF_PMU_SCOPE_SYS_WIDE,
+ PERF_PMU_MAX_SCOPE,
+};
+
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
@@ -315,6 +328,11 @@ struct pmu {
*/
int capabilities;
+ /*
+ * PMU scope
+ */
+ unsigned int scope;
+
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -411,6 +411,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -11497,10 +11502,60 @@ perf_event_mux_interval_ms_store(struct
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
@@ -11512,6 +11567,10 @@ static umode_t pmu_dev_is_visible(struct
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
@@ -11596,6 +11655,11 @@ int perf_pmu_register(struct pmu *pmu, c
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11750,6 +11814,22 @@ static int perf_try_init_event(struct pm
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->cpu = cpu;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
@@ -13713,6 +13793,12 @@ static void __init perf_event_init_all_c
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
@@ -13762,6 +13848,40 @@ static void __perf_event_exit_context(vo
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13769,6 +13889,11 @@ static void perf_event_exit_cpu_context(
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -13776,7 +13901,6 @@ static void perf_event_exit_cpu_context(
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
@@ -13785,6 +13909,42 @@ static void perf_event_exit_cpu_context(
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ cpumask_set_cpu(cpu, perf_online_mask);
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (!topology_sibling_cpumask(cpu)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ return;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13793,7 +13953,7 @@ int perf_event_init_cpu(unsigned int cpu
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
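
For reference, a minimal sketch (not part of this patch) of how a driver opts into the
generic scope support: set pmu::scope before perf_pmu_register(), as the cstate, iommu
and idxd conversions later in this series do. The example_* callbacks and the
"example_pkg" name are placeholders, not real kernel symbols:

	static struct pmu example_pkg_pmu = {
		.event_init	= example_event_init,	/* placeholder callbacks */
		.add		= example_event_add,
		.del		= example_event_del,
		.start		= example_event_start,
		.stop		= example_event_stop,
		.read		= example_event_read,
		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
		.scope		= PERF_PMU_SCOPE_PKG,	/* perf core exposes "cpumask" and migrates on hotplug */
		.module		= THIS_MODULE,
	};

	err = perf_pmu_register(&example_pkg_pmu, "example_pkg", -1);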

View File

@ -0,0 +1,71 @@
From 8c7eb17e722a6a45c4436e5debb9336089b21d9b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:38 -0700
Subject: perf: Add PERF_EV_CAP_READ_SCOPE
Usually, an event can be read from any CPU in the scope; it doesn't need
to be read from the advertised CPU.
Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with
a scope can be read from any active CPU in the scope.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 3 +++
kernel/events/core.c | 14 +++++++++++---
2 files changed, 14 insertions(+), 3 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -633,10 +633,13 @@ typedef void (*perf_overflow_handler_t)(
* PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
* cannot be a group leader. If an event with this flag is detached from the
* group it is scheduled out and moved into an unrecoverable ERROR state.
+ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
+ * PMU scope where it is active.
*/
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
#define PERF_EV_CAP_SIBLING BIT(2)
+#define PERF_EV_CAP_READ_SCOPE BIT(3)
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4477,16 +4477,24 @@ struct perf_read_data {
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
@@ -11824,7 +11832,7 @@ static int perf_try_init_event(struct pm
if (cpu >= nr_cpu_ids)
ret = -ENODEV;
else
- event->cpu = cpu;
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
} else {
ret = -ENODEV;
}

View File

@ -0,0 +1,286 @@
From 09c1529eb102b486220c35546f2663ca858a2943 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:39 -0700
Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug
There are three cstate PMUs with different scopes: core, die, and module.
These scopes are now supported by the generic perf_event subsystem.
Set the scope for each PMU and remove all the cpumask and hotplug code.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
arch/x86/events/intel/cstate.c | 142 ++-------------------------------
include/linux/cpuhotplug.h | 2 -
2 files changed, 5 insertions(+), 139 deletions(-)
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(st
static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf);
-
/* Model -> events mapping */
struct cstate_model {
unsigned long core_events;
@@ -206,22 +202,9 @@ static struct attribute_group cstate_for
.attrs = cstate_format_attrs,
};
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
- .attrs = cstate_cpumask_attrs,
-};
-
static const struct attribute_group *cstate_attr_groups[] = {
&cstate_events_attr_group,
&cstate_format_attr_group,
- &cpumask_attr_group,
NULL,
};
@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
-static cpumask_t cstate_pkg_cpu_mask;
-
/* cstate_module PMU */
static struct pmu cstate_module_pmu;
static bool has_cstate_module;
@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = {
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
-static cpumask_t cstate_module_cpu_mask;
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- if (pmu == &cstate_core_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
- else if (pmu == &cstate_pkg_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
- else if (pmu == &cstate_module_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
- else
- return 0;
-}
-
static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
- int cpu;
if (event->attr.type != event->pmu->type)
return -ENOENT;
@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct
if (!(core_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;
-
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
event->hw.event_base = pkg_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(event->cpu));
} else if (event->pmu == &cstate_module_pmu) {
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
return -EINVAL;
@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct
if (!(module_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = module_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
- if (cpu >= nr_cpu_ids)
- return -ENODEV;
-
- event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
return 0;
@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct p
return 0;
}
-/*
- * Check if exiting cpu is the designated reader. If so migrate the
- * events when there is a valid target available
- */
-static int cstate_cpu_exit(unsigned int cpu)
-{
- unsigned int target;
-
- if (has_cstate_core &&
- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
-
- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_core_cpu_mask);
- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
- }
- }
-
- if (has_cstate_pkg &&
- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
-
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
- }
- }
-
- if (has_cstate_module &&
- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
-
- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_module_cpu_mask);
- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
- }
- }
- return 0;
-}
-
-static int cstate_cpu_init(unsigned int cpu)
-{
- unsigned int target;
-
- /*
- * If this is the first online thread of that core, set it in
- * the core cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(cpu));
-
- if (has_cstate_core && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-
- /*
- * If this is the first online thread of that package, set it
- * in the package cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(cpu));
- if (has_cstate_pkg && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-
- /*
- * If this is the first online thread of that cluster, set it
- * in the cluster cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(cpu));
- if (has_cstate_module && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
-
- return 0;
-}
-
static const struct attribute_group *core_attr_update[] = {
&group_cstate_core_c1,
&group_cstate_core_c3,
@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CORE,
.module = THIS_MODULE,
};
@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_PKG,
.module = THIS_MODULE,
};
@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CLUSTER,
.module = THIS_MODULE,
};
@@ -810,9 +684,6 @@ static int __init cstate_probe(const str
static inline void cstate_cleanup(void)
{
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
-
if (has_cstate_core)
perf_pmu_unregister(&cstate_core_pmu);
@@ -827,11 +698,6 @@ static int __init cstate_init(void)
{
int err;
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
- "perf/x86/cstate:starting", cstate_cpu_init, NULL);
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
- "perf/x86/cstate:online", NULL, cstate_cpu_exit);
-
if (has_cstate_core) {
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
if (err) {
@@ -844,6 +710,8 @@ static int __init cstate_init(void)
if (has_cstate_pkg) {
if (topology_max_dies_per_package() > 1) {
+ /* CLX-AP is multi-die and the cstate is die-scope */
+ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
err = perf_pmu_register(&cstate_pkg_pmu,
"cstate_die", -1);
} else {
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,7 +152,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
- CPUHP_AP_PERF_X86_CSTATE_STARTING,
CPUHP_AP_PERF_XTENSA_STARTING,
CPUHP_AP_ARM_VFP_STARTING,
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
@@ -209,7 +208,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
CPUHP_AP_PERF_X86_RAPL_ONLINE,
- CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
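
For reference, a minimal user-space sketch (not part of this patch) showing that the
"cpumask" attribute remains readable from the conventional sysfs location after the
conversion; the cstate_pkg path below is an assumption about the running system, not
something this patch adds:

	#include <stdio.h>

	int main(void)
	{
		char buf[256];
		/* conventional event_source sysfs location for a PMU's cpumask */
		FILE *f = fopen("/sys/bus/event_source/devices/cstate_pkg/cpumask", "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			printf("designated reader CPU(s): %s", buf);
		fclose(f);
		return 0;
	}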

View File

@ -0,0 +1,188 @@
From f91da33af8295b4b3d73a2083225f69e1d5ff301 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:40 -0700
Subject: iommu/vt-d: Clean up cpumask and hotplug for perfmon
The iommu PMU has system-wide scope, which is now supported by the generic
perf_event subsystem.
Set the scope for the iommu PMU and remove all the cpumask and hotplug
code.
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Cc: iommu@lists.linux.dev
---
drivers/iommu/intel/iommu.h | 2 -
drivers/iommu/intel/perfmon.c | 111 +---------------------------------
2 files changed, 2 insertions(+), 111 deletions(-)
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -687,8 +687,6 @@ struct iommu_pmu {
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
unsigned char irq_name[16];
- struct hlist_node cpuhp_node;
- int cpu;
};
#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
--- a/drivers/iommu/intel/perfmon.c
+++ b/drivers/iommu/intel/perfmon.c
@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_
.attrs = attrs_empty,
};
-static cpumask_t iommu_pmu_cpu_mask;
-
-static ssize_t
-cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
-}
-static DEVICE_ATTR_RO(cpumask);
-
-static struct attribute *iommu_pmu_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL
-};
-
-static struct attribute_group iommu_pmu_cpumask_attr_group = {
- .attrs = iommu_pmu_cpumask_attrs,
-};
-
static const struct attribute_group *iommu_pmu_attr_groups[] = {
&iommu_pmu_format_attr_group,
&iommu_pmu_events_attr_group,
- &iommu_pmu_cpumask_attr_group,
NULL
};
@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct i
iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
iommu_pmu->pmu.module = THIS_MODULE;
return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(st
iommu->perf_irq = 0;
}
-static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
-
- if (cpumask_empty(&iommu_pmu_cpu_mask))
- cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
-
- if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
- iommu_pmu->cpu = cpu;
-
- return 0;
-}
-
-static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
- int target = cpumask_first(&iommu_pmu_cpu_mask);
-
- /*
- * The iommu_pmu_cpu_mask has been updated when offline the CPU
- * for the first iommu_pmu. Migrate the other iommu_pmu to the
- * new target.
- */
- if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
- return 0;
- }
-
- if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
- return 0;
-
- target = cpumask_any_but(cpu_online_mask, cpu);
-
- if (target < nr_cpu_ids)
- cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
- else
- return 0;
-
- perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
- iommu_pmu->cpu = target;
-
- return 0;
-}
-
-static int nr_iommu_pmu;
-static enum cpuhp_state iommu_cpuhp_slot;
-
-static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
-{
- int ret;
-
- if (!nr_iommu_pmu) {
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
- "driver/iommu/intel/perfmon:online",
- iommu_pmu_cpu_online,
- iommu_pmu_cpu_offline);
- if (ret < 0)
- return ret;
- iommu_cpuhp_slot = ret;
- }
-
- ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
- if (ret) {
- if (!nr_iommu_pmu)
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
- return ret;
- }
- nr_iommu_pmu++;
-
- return 0;
-}
-
-static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
-{
- cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
-
- if (--nr_iommu_pmu)
- return;
-
- cpuhp_remove_multi_state(iommu_cpuhp_slot);
-}
-
void iommu_pmu_register(struct intel_iommu *iommu)
{
struct iommu_pmu *iommu_pmu = iommu->pmu;
@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iom
if (__iommu_pmu_register(iommu))
goto err;
- if (iommu_pmu_cpuhp_setup(iommu_pmu))
- goto unregister;
-
/* Set interrupt for overflow */
if (iommu_pmu_set_interrupt(iommu))
- goto cpuhp_free;
+ goto unregister;
return;
-cpuhp_free:
- iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
perf_pmu_unregister(&iommu_pmu->pmu);
err:
@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_i
return;
iommu_pmu_unset_interrupt(iommu);
- iommu_pmu_cpuhp_free(iommu_pmu);
perf_pmu_unregister(&iommu_pmu->pmu);
}

View File

@ -0,0 +1,238 @@
From 76278bd3946d618ead2d9cc22612a75a4ab99ace Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:41 -0700
Subject: dmaengine: idxd: Clean up cpumask and hotplug for perfmon
The idxd PMU has system-wide scope, which is now supported by the generic
perf_event subsystem.
Set the scope for the idxd PMU and remove all the cpumask and hotplug
code.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Vinod Koul <vkoul@kernel.org>
Cc: dmaengine@vger.kernel.org
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
---
drivers/dma/idxd/idxd.h | 7 ---
drivers/dma/idxd/init.c | 3 --
drivers/dma/idxd/perfmon.c | 98 +-------------------------------------
3 files changed, 1 insertion(+), 107 deletions(-)
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -124,7 +124,6 @@ struct idxd_pmu {
struct pmu pmu;
char name[IDXD_NAME_SIZE];
- int cpu;
int n_counters;
int counter_width;
@@ -135,8 +134,6 @@ struct idxd_pmu {
unsigned long supported_filters;
int n_filters;
-
- struct hlist_node cpuhp_node;
};
#define IDXD_MAX_PRIORITY 0xf
@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct
int perfmon_pmu_init(struct idxd_device *idxd);
void perfmon_pmu_remove(struct idxd_device *idxd);
void perfmon_counter_overflow(struct idxd_device *idxd);
-void perfmon_init(void);
-void perfmon_exit(void);
#else
static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; }
static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
-static inline void perfmon_init(void) {}
-static inline void perfmon_exit(void) {}
#endif
/* debugfs */
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -878,8 +878,6 @@ static int __init idxd_init_module(void)
else
support_enqcmd = true;
- perfmon_init();
-
err = idxd_driver_register(&idxd_drv);
if (err < 0)
goto err_idxd_driver_register;
@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void
idxd_driver_unregister(&idxd_drv);
pci_unregister_driver(&idxd_pci_driver);
idxd_cdev_remove();
- perfmon_exit();
idxd_remove_debugfs();
}
module_exit(idxd_exit_module);
--- a/drivers/dma/idxd/perfmon.c
+++ b/drivers/dma/idxd/perfmon.c
@@ -6,29 +6,6 @@
#include "idxd.h"
#include "perfmon.h"
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
- char *buf);
-
-static cpumask_t perfmon_dsa_cpu_mask;
-static bool cpuhp_set_up;
-static enum cpuhp_state cpuhp_slot;
-
-/*
- * perf userspace reads this attribute to determine which cpus to open
- * counters on. It's connected to perfmon_dsa_cpu_mask, which is
- * maintained by the cpu hotplug handlers.
- */
-static DEVICE_ATTR_RO(cpumask);
-
-static struct attribute *perfmon_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
- .attrs = perfmon_cpumask_attrs,
-};
-
/*
* These attributes specify the bits in the config word that the perf
* syscall uses to pass the event ids and categories to perfmon.
@@ -67,16 +44,9 @@ static struct attribute_group perfmon_fo
static const struct attribute_group *perfmon_attr_groups[] = {
&perfmon_format_attr_group,
- &cpumask_attr_group,
NULL,
};
-static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
-}
-
static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
{
return &idxd_pmu->pmu == event->pmu;
@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct
return -EINVAL;
event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
- event->cpu = idxd->idxd_pmu->cpu;
event->hw.config = event->attr.config;
if (event->group_leader != event)
@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pm
idxd_pmu->pmu.stop = perfmon_pmu_event_stop;
idxd_pmu->pmu.read = perfmon_pmu_event_update;
idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+ idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
idxd_pmu->pmu.module = THIS_MODULE;
}
@@ -496,47 +466,11 @@ void perfmon_pmu_remove(struct idxd_devi
if (!idxd->idxd_pmu)
return;
- cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
perf_pmu_unregister(&idxd->idxd_pmu->pmu);
kfree(idxd->idxd_pmu);
idxd->idxd_pmu = NULL;
}
-static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct idxd_pmu *idxd_pmu;
-
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
-
- /* select the first online CPU as the designated reader */
- if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
- cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
- idxd_pmu->cpu = cpu;
- }
-
- return 0;
-}
-
-static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct idxd_pmu *idxd_pmu;
- unsigned int target;
-
- idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
-
- if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
- return 0;
-
- target = cpumask_any_but(cpu_online_mask, cpu);
- /* migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
- perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
- }
-
- return 0;
-}
-
int perfmon_pmu_init(struct idxd_device *idxd)
{
union idxd_perfcap perfcap;
@@ -544,12 +478,6 @@ int perfmon_pmu_init(struct idxd_device
int rc = -ENODEV;
/*
- * perfmon module initialization failed, nothing to do
- */
- if (!cpuhp_set_up)
- return -ENODEV;
-
- /*
* If perfmon_offset or num_counters is 0, it means perfmon is
* not supported on this hardware.
*/
@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device
if (rc)
goto free;
- rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
- if (rc) {
- perf_pmu_unregister(&idxd->idxd_pmu->pmu);
- goto free;
- }
out:
return rc;
free:
@@ -637,22 +560,3 @@ free:
goto out;
}
-
-void __init perfmon_init(void)
-{
- int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
- "driver/dma/idxd/perf:online",
- perf_event_cpu_online,
- perf_event_cpu_offline);
- if (WARN_ON(rc < 0))
- return;
-
- cpuhp_slot = rc;
- cpuhp_set_up = true;
-}
-
-void __exit perfmon_exit(void)
-{
- if (cpuhp_set_up)
- cpuhp_remove_multi_state(cpuhp_slot);
-}
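
The hunk above swaps the driver-maintained cpumask attribute and CPU hotplug handlers for PERF_PMU_SCOPE_SYS_WIDE. For context on the comment being removed ("perf userspace reads this attribute to determine which cpus to open counters on"), here is a minimal user-space sketch of that sysfs lookup. The only assumptions are the standard /sys/bus/event_source/devices/<pmu>/cpumask layout and a PMU name passed on the command line; after conversions like this one a PMU may no longer expose the file at all.

/* pmu_cpumask.c - print the cpumask sysfs attribute of a perf PMU, if any.
 * Illustrative sketch only: the PMU name comes from argv[1], and not every
 * PMU exposes a cpumask file (scope-managed PMUs may drop it entirely).
 */
#include <stdio.h>

int main(int argc, char **argv)
{
        char path[256], buf[256];
        FILE *f;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <pmu-name>\n", argv[0]);
                return 1;
        }

        snprintf(path, sizeof(path),
                 "/sys/bus/event_source/devices/%s/cpumask", argv[1]);
        f = fopen(path, "r");
        if (!f) {
                perror(path);           /* PMU absent or no cpumask attribute */
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: open counters on CPU(s) %s", argv[1], buf);
        fclose(f);
        return 0;
}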

View File

@ -0,0 +1,84 @@
From fb0a3b5932882f02ed42fcaa6db73aba3eafd6d7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:42 -0700
Subject: perf/x86/rapl: Move the pmu allocation out of CPU hotplug
The rapl pmu only needs to be allocated once; it makes no difference
whether that happens in the CPU hotplug callback or in the global
init_rapl_pmus().
Move the pmu allocation to init_rapl_pmus() so that the generic hotplug
support can be applied.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 44 +++++++++++++++++++++++++++++-------------
1 file changed, 31 insertions(+), 13 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -568,19 +568,8 @@ static int rapl_cpu_online(unsigned int
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
- if (!pmu) {
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
- return -ENOMEM;
-
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
-
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
- }
+ if (!pmu)
+ return -ENOMEM;
/*
* Check if there is an online cpu in the package which collects rapl
@@ -673,6 +662,32 @@ static const struct attribute_group *rap
NULL,
};
+static void __init init_rapl_pmu(void)
+{
+ struct rapl_pmu *pmu;
+ int cpu;
+
+ cpus_read_lock();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ pmu = cpu_to_rapl_pmu(cpu);
+ if (pmu)
+ continue;
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ continue;
+ raw_spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(pmu);
+
+ rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ }
+
+ cpus_read_unlock();
+}
+
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
@@ -693,6 +708,9 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
+
+ init_rapl_pmu();
+
return 0;
}
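
The init_rapl_pmu() helper added above walks the online CPUs once at init time and allocates a per-domain rapl_pmu only where one is still missing. As a rough user-space analogue of that one-shot walk, the sketch below parses the kernel's online-CPU range list; the only assumptions are the standard /sys/devices/system/cpu/online file and its "0-3,8-11" style format.

/* online_cpus.c - iterate the online CPUs the way a one-shot init walk would,
 * by parsing the range list in /sys/devices/system/cpu/online (e.g. "0-3,8-11").
 * User-space analogue for illustration; it does not touch any PMU state.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char buf[4096];
        FILE *f = fopen("/sys/devices/system/cpu/online", "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
                perror("/sys/devices/system/cpu/online");
                return 1;
        }
        fclose(f);

        for (char *tok = strtok(buf, ",\n"); tok; tok = strtok(NULL, ",\n")) {
                int first, last;

                if (sscanf(tok, "%d-%d", &first, &last) != 2)
                        first = last = atoi(tok);
                for (int cpu = first; cpu <= last; cpu++)
                        printf("would set up per-domain state for CPU %d\n", cpu);
        }
        return 0;
}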

View File

@ -0,0 +1,179 @@
From 7b4f6ba1b1dc5f3120652bcb5921a697d5167bff Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:43 -0700
Subject: perf/x86/rapl: Clean up cpumask and hotplug
The rapl pmu is die scope, which the generic perf_event subsystem now
supports.
Set the scope for the rapl PMU and remove all of the cpumask and hotplug
code.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 80 +-------------------------------------
include/linux/cpuhotplug.h | 1 -
2 files changed, 2 insertions(+), 79 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -135,7 +135,6 @@ struct rapl_model {
/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
-static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
@@ -340,8 +339,6 @@ static int rapl_pmu_event_init(struct pe
if (event->cpu < 0)
return -EINVAL;
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
return -EINVAL;
@@ -360,7 +357,6 @@ static int rapl_pmu_event_init(struct pe
pmu = cpu_to_rapl_pmu(event->cpu);
if (!pmu)
return -EINVAL;
- event->cpu = pmu->cpu;
event->pmu_private = pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
@@ -374,23 +370,6 @@ static void rapl_pmu_event_read(struct p
rapl_event_update(event);
}
-static ssize_t rapl_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
-
-static struct attribute *rapl_pmu_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group rapl_pmu_attr_group = {
- .attrs = rapl_pmu_attrs,
-};
-
RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
@@ -438,7 +417,6 @@ static struct attribute_group rapl_pmu_f
};
static const struct attribute_group *rapl_attr_groups[] = {
- &rapl_pmu_attr_group,
&rapl_pmu_format_group,
&rapl_pmu_events_group,
NULL,
@@ -541,49 +519,6 @@ static struct perf_msr amd_rapl_msrs[] =
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
-static int rapl_cpu_offline(unsigned int cpu)
-{
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
- int target;
-
- /* Check if exiting cpu is used for collecting rapl events */
- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
- return 0;
-
- pmu->cpu = -1;
- /* Find a new cpu to collect rapl events */
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
-
- /* Migrate rapl events to the new target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &rapl_cpu_mask);
- pmu->cpu = target;
- perf_pmu_migrate_context(pmu->pmu, cpu, target);
- }
- return 0;
-}
-
-static int rapl_cpu_online(unsigned int cpu)
-{
- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
- int target;
-
- if (!pmu)
- return -ENOMEM;
-
- /*
- * Check if there is an online cpu in the package which collects rapl
- * events already.
- */
- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
- if (target < nr_cpu_ids)
- return 0;
-
- cpumask_set_cpu(cpu, &rapl_cpu_mask);
- pmu->cpu = cpu;
- return 0;
-}
-
static int rapl_check_hw_unit(struct rapl_model *rm)
{
u64 msr_rapl_power_unit_bits;
@@ -707,6 +642,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
+ rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
init_rapl_pmu();
@@ -857,24 +793,13 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
- /*
- * Install callbacks. Core will call them for each online cpu.
- */
- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
- "perf/x86/rapl:online",
- rapl_cpu_online, rapl_cpu_offline);
- if (ret)
- goto out;
-
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
if (ret)
- goto out1;
+ goto out;
rapl_advertise();
return 0;
-out1:
- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
cleanup_rapl_pmus();
@@ -884,7 +809,6 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
perf_pmu_unregister(&rapl_pmus->pmu);
cleanup_rapl_pmus();
}
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -207,7 +207,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
- CPUHP_AP_PERF_X86_RAPL_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
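
With the cpumask attribute and hotplug callbacks gone, the PMU scope tells the perf core where the counter may live; a user-space reader still just resolves the dynamic PMU type from sysfs and opens the event on a CPU in the domain. A minimal sketch of such a reader follows. It assumes the RAPL PMU is registered as "power" and that energy-pkg is event=0x02 (matching the event attributes shown earlier in these patches), and it usually needs root or a relaxed perf_event_paranoid setting.

/* rapl_read.c - open the RAPL energy-pkg counter via perf_event_open and
 * print one raw reading. Sketch only: assumes a "power" PMU and the
 * event=0x02 encoding for energy-pkg; run as root on most systems.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long count;
        unsigned int type;
        FILE *f;
        int fd;

        /* The PMU type id is assigned dynamically; read it from sysfs. */
        f = fopen("/sys/bus/event_source/devices/power/type", "r");
        if (!f || fscanf(f, "%u", &type) != 1) {
                perror("power PMU type");
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0x02;     /* energy-pkg, per the event attribute above */

        /* CPU 0 sits in the first package/die, so it can read that domain. */
        fd = perf_event_open(&attr, -1, 0, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        sleep(1);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("energy-pkg raw count after ~1s: %llu\n", count);
        close(fd);
        return 0;
}

The raw count is in the PMU's advertised unit (see the energy-pkg.scale and energy-pkg.unit files next to the event), 2^-32 Joules for this driver.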

View File

@ -0,0 +1,101 @@
From f1525664ff9da3241b3556594dc0b67506ae1ddd Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Tue, 10 Sep 2024 14:25:05 +0530
Subject: perf/x86/rapl: Fix the energy-pkg event for AMD CPUs
After commit ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf"),
on AMD processors that support extended CPUID leaf 0x80000026, the
topology_die_cpumask() and topology_logical_die_id() macros no longer
return the package cpumask and package id; instead they return the CCD
(Core Complex Die) mask and id respectively. This changes the scope of the
energy-pkg event to CCD instead of package.
So, change the PMU scope for AMD and Hygon back to package.
On a 12 CCD 1 Package AMD Zen4 Genoa machine:
Before:
$ cat /sys/devices/power/cpumask
0,8,16,24,32,40,48,56,64,72,80,88.
The cpumask here is supposed to be just "0": as this is a package-scope
event, only one CPU should be collecting the event for all the CPUs in
the package.
After:
$ cat /sys/devices/power/cpumask
0
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 35 ++++++++++++++++++++++++++++++++---
1 file changed, 32 insertions(+), 3 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -139,9 +139,32 @@ static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
+ * 2.2. Other Intel platforms are single die systems so the scope can be
+ * considered as either pkg-scope or die-scope, and we are considering
+ * them as die-scope.
+ */
+#define rapl_pmu_is_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
+/*
+ * Helper function to get the correct topology id according to the
+ * RAPL PMU scope.
+ */
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+ topology_logical_die_id(cpu);
+}
+
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
- unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
@@ -617,7 +640,7 @@ static void __init init_rapl_pmu(void)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
}
cpus_read_unlock();
@@ -626,6 +649,12 @@ static void __init init_rapl_pmu(void)
static int __init init_rapl_pmus(void)
{
int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+
+ if (rapl_pmu_is_pkg_scope()) {
+ nr_rapl_pmu = topology_max_packages();
+ rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
+ }
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
@@ -641,8 +670,8 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.scope = rapl_pmu_scope;
rapl_pmus->pmu.module = THIS_MODULE;
- rapl_pmus->pmu.scope = PERF_PMU_SCOPE_DIE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
init_rapl_pmu();
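
The fix above boils down to a vendor check: on AMD and Hygon the energy counter is per package, so the PMU must not be split per CCD/die. A rough user-space cross-check is sketched below; it prints the CPU vendor string and the reader cpumask that the commit message cats. The /proc/cpuinfo parsing is a simplification, and /sys/devices/power/cpumask may be absent on kernels without this PMU.

/* rapl_scope_check.c - rough cross-check of the RAPL package PMU scope:
 * print the CPU vendor and the PMU reader cpumask. On AMD/Hygon the
 * expectation after this fix is one reader CPU per package (e.g. "0").
 * Sketch only; paths and the /proc/cpuinfo parsing are simplifications.
 */
#include <stdio.h>
#include <string.h>

static void print_file(const char *path, const char *label)
{
        char buf[512];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", label, buf);
        fclose(f);
}

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/cpuinfo", "r");

        if (f) {
                while (fgets(line, sizeof(line), f)) {
                        if (!strncmp(line, "vendor_id", 9)) {
                                /* AuthenticAMD / HygonGenuine -> package scope */
                                printf("%s", line);
                                break;
                        }
                }
                fclose(f);
        }

        print_file("/sys/devices/power/cpumask", "power PMU cpumask");
        return 0;
}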

View File

@ -0,0 +1,77 @@
From 9439067951f4d857272836b35812af26650d9c16 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Fri, 13 Sep 2024 15:21:41 +0000
Subject: x86/topology: Introduce topology_logical_core_id()
On x86, topology_core_id() returns a unique core ID within the PKG
domain. Looking at match_smt() suggests that a core ID just needs to be
unique within an LLC domain. For use cases such as the per-core RAPL PMU,
there exists a need for a unique core ID across the entire system with
multiple PKG domains. Introduce topology_logical_core_id() to derive a
unique core ID across the system.
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Documentation/arch/x86/topology.rst | 4 ++++
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/topology.h | 1 +
arch/x86/kernel/cpu/debugfs.c | 1 +
arch/x86/kernel/cpu/topology_common.c | 1 +
5 files changed, 8 insertions(+)
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -135,6 +135,10 @@ Thread-related topology information in t
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
"core_id."
+ - topology_logical_core_id();
+
+ The logical core ID to which a thread belongs.
+
System topology examples
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings
u32 logical_pkg_id;
u32 logical_die_id;
+ u32 logical_core_id;
// AMD Node ID and Nodes per Package info
u32 amd_node_id;
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_cluster
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
+#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_fil
seq_printf(m, "core_id: %u\n", c->topo.core_id);
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_sca
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
}
/* Package relative core ID */
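
That topology_core_id() is only package-relative is easy to observe from user space: the same core_id value shows up once per package, so a system-wide identity needs the (package, core) pair, which is exactly what the new logical core ID collapses into a single number. The sketch below walks the standard sysfs topology files; probing cpu0, cpu1, ... until a file is missing is a simplification (offline CPUs can make it stop early).

/* core_ids.c - show that topology/core_id repeats across packages, which is
 * why a system-wide logical core ID is useful. Sketch only: it probes
 * cpu0..cpuN in order and stops at the first missing topology file.
 */
#include <stdio.h>

static int read_topo(int cpu, const char *file, int *val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/topology/%s", cpu, file);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%d", val) != 1)
                *val = -1;
        fclose(f);
        return 0;
}

int main(void)
{
        for (int cpu = 0; ; cpu++) {
                int pkg, core;

                if (read_topo(cpu, "physical_package_id", &pkg) ||
                    read_topo(cpu, "core_id", &core))
                        break;  /* no more CPUs (or an offline one) */

                /* core_id alone is only unique within the package */
                printf("cpu%-3d package %d core %d\n", cpu, pkg, core);
        }
        return 0;
}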

View File

@ -0,0 +1,87 @@
From b8e1231d5f78314de8f9066baba7b1fdd5e59218 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:42 +0000
Subject: perf/x86/rapl: Remove the cpu_to_rapl_pmu() function
Preparation for the addition of per-core RAPL energy counter support for
AMD CPUs. After that, one CPU might be mapped to more than one rapl_pmu
(the package/die one or the per-core one), so it also makes sense to use
the get_rapl_pmu_idx helper, which is already used to index into the
rapl_pmus->pmus[] array.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -162,17 +162,6 @@ static inline unsigned int get_rapl_pmu_
topology_logical_die_id(cpu);
}
-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
-{
- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
-
- /*
- * The unsigned check also catches the '-1' return value for non
- * existent mappings in the topology map.
- */
- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
-}
-
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -348,7 +337,7 @@ static void rapl_pmu_event_del(struct pe
static int rapl_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
- int bit, ret = 0;
+ int bit, rapl_pmu_idx, ret = 0;
struct rapl_pmu *pmu;
/* only look at RAPL events */
@@ -376,8 +365,12 @@ static int rapl_pmu_event_init(struct pe
if (event->attr.sample_period) /* no sampling */
return -EINVAL;
+ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ return -EINVAL;
+
/* must be done before validate_group */
- pmu = cpu_to_rapl_pmu(event->cpu);
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
if (!pmu)
return -EINVAL;
event->pmu_private = pmu;
@@ -623,12 +616,16 @@ static const struct attribute_group *rap
static void __init init_rapl_pmu(void)
{
struct rapl_pmu *pmu;
- int cpu;
+ int cpu, rapl_pmu_idx;
cpus_read_lock();
for_each_cpu(cpu, cpu_online_mask) {
- pmu = cpu_to_rapl_pmu(cpu);
+ rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ continue;
+
+ pmu = rapl_pmus->pmus[rapl_pmu_idx];
if (pmu)
continue;
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
@@ -640,7 +637,7 @@ static void __init init_rapl_pmu(void)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[get_rapl_pmu_idx(cpu)] = pmu;
+ rapl_pmus->pmus[rapl_pmu_idx] = pmu;
}
cpus_read_unlock();

View File

@ -0,0 +1,240 @@
From 07ec9f38cac6eb6e5b0b062ef99e9458ba567de8 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:21:43 +0000
Subject: perf/x86/rapl: Rename rapl_pmu variables
Rename struct rapl_pmu variables from "pmu" to "rapl_pmu" to
avoid any confusion between variables of the two different
structs, pmu and rapl_pmu. Since struct rapl_pmu also contains a
pointer to struct pmu, the old naming leads to code like pmu->pmu,
which is needlessly confusing. With this change that becomes the
much more readable rapl_pmu->pmu.
Also rename the "pmus" member of the rapl_pmus struct, for the same reason.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 93 +++++++++++++++++++++---------------------
1 file changed, 47 insertions(+), 46 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -116,7 +116,7 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
unsigned int nr_rapl_pmu;
- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
+ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@@ -223,34 +223,34 @@ static void rapl_start_hrtimer(struct ra
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
struct perf_event *event;
unsigned long flags;
- if (!pmu->n_active)
+ if (!rapl_pmu->n_active)
return HRTIMER_NORESTART;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
- list_for_each_entry(event, &pmu->active_list, active_entry)
+ list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
rapl_event_update(event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
- hrtimer_forward_now(hrtimer, pmu->timer_interval);
+ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
return HRTIMER_RESTART;
}
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
{
- struct hrtimer *hr = &pmu->hrtimer;
+ struct hrtimer *hr = &rapl_pmu->hrtimer;
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hr->function = rapl_hrtimer_handle;
}
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
struct perf_event *event)
{
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@@ -258,39 +258,39 @@ static void __rapl_pmu_event_start(struc
event->hw.state = 0;
- list_add_tail(&event->active_entry, &pmu->active_list);
+ list_add_tail(&event->active_entry, &rapl_pmu->active_list);
local64_set(&event->hw.prev_count, rapl_read_counter(event));
- pmu->n_active++;
- if (pmu->n_active == 1)
- rapl_start_hrtimer(pmu);
+ rapl_pmu->n_active++;
+ if (rapl_pmu->n_active == 1)
+ rapl_start_hrtimer(rapl_pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
- __rapl_pmu_event_start(pmu, event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+ __rapl_pmu_event_start(rapl_pmu, event);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
- WARN_ON_ONCE(pmu->n_active <= 0);
- pmu->n_active--;
- if (pmu->n_active == 0)
- hrtimer_cancel(&pmu->hrtimer);
+ WARN_ON_ONCE(rapl_pmu->n_active <= 0);
+ rapl_pmu->n_active--;
+ if (rapl_pmu->n_active == 0)
+ hrtimer_cancel(&rapl_pmu->hrtimer);
list_del(&event->active_entry);
@@ -308,23 +308,23 @@ static void rapl_pmu_event_stop(struct p
hwc->state |= PERF_HES_UPTODATE;
}
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = event->pmu_private;
+ struct rapl_pmu *rapl_pmu = event->pmu_private;
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
- raw_spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
if (mode & PERF_EF_START)
- __rapl_pmu_event_start(pmu, event);
+ __rapl_pmu_event_start(rapl_pmu, event);
- raw_spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
return 0;
}
@@ -338,7 +338,7 @@ static int rapl_pmu_event_init(struct pe
{
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
int bit, rapl_pmu_idx, ret = 0;
- struct rapl_pmu *pmu;
+ struct rapl_pmu *rapl_pmu;
/* only look at RAPL events */
if (event->attr.type != rapl_pmus->pmu.type)
@@ -370,10 +370,11 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
/* must be done before validate_group */
- pmu = rapl_pmus->pmus[rapl_pmu_idx];
- if (!pmu)
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
+ if (!rapl_pmu)
return -EINVAL;
- event->pmu_private = pmu;
+
+ event->pmu_private = rapl_pmu;
event->hw.event_base = rapl_msrs[bit].msr;
event->hw.config = cfg;
event->hw.idx = bit;
@@ -600,7 +601,7 @@ static void cleanup_rapl_pmus(void)
int i;
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
- kfree(rapl_pmus->pmus[i]);
+ kfree(rapl_pmus->rapl_pmu[i]);
kfree(rapl_pmus);
}
@@ -615,7 +616,7 @@ static const struct attribute_group *rap
static void __init init_rapl_pmu(void)
{
- struct rapl_pmu *pmu;
+ struct rapl_pmu *rapl_pmu;
int cpu, rapl_pmu_idx;
cpus_read_lock();
@@ -625,19 +626,19 @@ static void __init init_rapl_pmu(void)
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
continue;
- pmu = rapl_pmus->pmus[rapl_pmu_idx];
- if (pmu)
+ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
+ if (rapl_pmu)
continue;
- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
- if (!pmu)
+ rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!rapl_pmu)
continue;
- raw_spin_lock_init(&pmu->lock);
- INIT_LIST_HEAD(&pmu->active_list);
- pmu->pmu = &rapl_pmus->pmu;
- pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
- rapl_hrtimer_init(pmu);
+ raw_spin_lock_init(&rapl_pmu->lock);
+ INIT_LIST_HEAD(&rapl_pmu->active_list);
+ rapl_pmu->pmu = &rapl_pmus->pmu;
+ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(rapl_pmu);
- rapl_pmus->pmus[rapl_pmu_idx] = pmu;
+ rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
}
cpus_read_unlock();
@@ -653,7 +654,7 @@ static int __init init_rapl_pmus(void)
rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
}
- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;

View File

@ -0,0 +1,75 @@
From 68614752b9fd6b6bae6f9ab7b02fc28350c5a541 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:56 +0000
Subject: perf/x86/rapl: Make rapl_model struct global
Preparation for the addition of per-core energy counter support for AMD CPUs.
As there will only ever be one rapl_model variable on a system, make it
global so that it is easier to access from any function.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -138,6 +138,7 @@ static struct rapl_pmus *rapl_pmus;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+static struct rapl_model *rapl_model;
/*
* RAPL Package energy counter scope:
@@ -536,18 +537,18 @@ static struct perf_msr amd_rapl_msrs[] =
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};
-static int rapl_check_hw_unit(struct rapl_model *rm)
+static int rapl_check_hw_unit(void)
{
u64 msr_rapl_power_unit_bits;
int i;
/* protect rdmsrl() to handle virtualization */
- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
+ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
for (i = 0; i < NR_RAPL_DOMAINS; i++)
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
- switch (rm->unit_quirk) {
+ switch (rapl_model->unit_quirk) {
/*
* DRAM domain on HSW server and KNL has fixed energy unit which can be
* different than the unit from power unit MSR. See
@@ -798,21 +799,20 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
- struct rapl_model *rm;
int ret;
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
- rm = (struct rapl_model *) id->driver_data;
+ rapl_model = (struct rapl_model *) id->driver_data;
- rapl_msrs = rm->rapl_msrs;
+ rapl_msrs = rapl_model->rapl_msrs;
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
- false, (void *) &rm->events);
+ false, (void *) &rapl_model->events);
- ret = rapl_check_hw_unit(rm);
+ ret = rapl_check_hw_unit();
if (ret)
return ret;

View File

@ -0,0 +1,112 @@
From b10b887510ccb0b6bc7294888982b862703c9c32 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:57 +0000
Subject: perf/x86/rapl: Add arguments to the cleanup and init functions
Prep for per-core RAPL PMU addition.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 32 +++++++++++++++++++-------------
1 file changed, 19 insertions(+), 13 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -597,7 +597,7 @@ static void __init rapl_advertise(void)
}
}
-static void cleanup_rapl_pmus(void)
+static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
{
int i;
@@ -615,7 +615,7 @@ static const struct attribute_group *rap
NULL,
};
-static void __init init_rapl_pmu(void)
+static void __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
{
struct rapl_pmu *rapl_pmu;
int cpu, rapl_pmu_idx;
@@ -645,20 +645,22 @@ static void __init init_rapl_pmu(void)
cpus_read_unlock();
}
-static int __init init_rapl_pmus(void)
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope)
{
- int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+ int nr_rapl_pmu;
+ struct rapl_pmus *rapl_pmus;
- if (rapl_pmu_is_pkg_scope()) {
- nr_rapl_pmu = topology_max_packages();
- rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
- }
+ if (rapl_pmu_scope == PERF_PMU_SCOPE_PKG)
+ nr_rapl_pmu = topology_max_packages();
+ else
+ nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
+ *rapl_pmus_ptr = rapl_pmus;
+
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
@@ -673,7 +675,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.module = THIS_MODULE;
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
- init_rapl_pmu();
+ init_rapl_pmu(rapl_pmus);
return 0;
}
@@ -799,8 +801,12 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
+ int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret;
+ if (rapl_pmu_is_pkg_scope())
+ rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
+
id = x86_match_cpu(rapl_model_match);
if (!id)
return -ENODEV;
@@ -816,7 +822,7 @@ static int __init rapl_pmu_init(void)
if (ret)
return ret;
- ret = init_rapl_pmus();
+ ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope);
if (ret)
return ret;
@@ -829,7 +835,7 @@ static int __init rapl_pmu_init(void)
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
- cleanup_rapl_pmus();
+ cleanup_rapl_pmus(rapl_pmus);
return ret;
}
module_init(rapl_pmu_init);
@@ -837,6 +843,6 @@ module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
perf_pmu_unregister(&rapl_pmus->pmu);
- cleanup_rapl_pmus();
+ cleanup_rapl_pmus(rapl_pmus);
}
module_exit(intel_rapl_exit);

View File

@ -0,0 +1,358 @@
From b5c83c40540298a39f8314034b705f1236b17a9f Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:58 +0000
Subject: perf/x86/rapl: Modify the generic variable names to *_pkg*
Prep for the addition of a power_per_core PMU to handle core-scope energy
consumption for AMD CPUs.
Replace the generic names with *_pkg* to differentiate between the
scopes of the two different PMUs and their variables.
No functional change.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 118 ++++++++++++++++++++---------------------
1 file changed, 59 insertions(+), 59 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -70,18 +70,18 @@ MODULE_LICENSE("GPL");
/*
* RAPL energy status counters
*/
-enum perf_rapl_events {
+enum perf_rapl_pkg_events {
PERF_RAPL_PP0 = 0, /* all cores */
PERF_RAPL_PKG, /* entire package */
PERF_RAPL_RAM, /* DRAM */
PERF_RAPL_PP1, /* gpu */
PERF_RAPL_PSYS, /* psys */
- PERF_RAPL_MAX,
- NR_RAPL_DOMAINS = PERF_RAPL_MAX,
+ PERF_RAPL_PKG_EVENTS_MAX,
+ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
};
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
@@ -126,16 +126,16 @@ enum rapl_unit_quirk {
};
struct rapl_model {
- struct perf_msr *rapl_msrs;
- unsigned long events;
+ struct perf_msr *rapl_pkg_msrs;
+ unsigned long pkg_events;
unsigned int msr_power_unit;
enum rapl_unit_quirk unit_quirk;
};
/* 1/2^hw_unit Joule */
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
-static struct rapl_pmus *rapl_pmus;
-static unsigned int rapl_cntr_mask;
+static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus_pkg;
+static unsigned int rapl_pkg_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
@@ -149,7 +149,7 @@ static struct rapl_model *rapl_model;
* considered as either pkg-scope or die-scope, and we are considering
* them as die-scope.
*/
-#define rapl_pmu_is_pkg_scope() \
+#define rapl_pkg_pmu_is_pkg_scope() \
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
@@ -159,7 +159,7 @@ static struct rapl_model *rapl_model;
*/
static inline unsigned int get_rapl_pmu_idx(int cpu)
{
- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+ return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
topology_logical_die_id(cpu);
}
@@ -172,7 +172,7 @@ static inline u64 rapl_read_counter(stru
static inline u64 rapl_scale(u64 v, int cfg)
{
- if (cfg > NR_RAPL_DOMAINS) {
+ if (cfg > NR_RAPL_PKG_DOMAINS) {
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
return v;
}
@@ -182,7 +182,7 @@ static inline u64 rapl_scale(u64 v, int
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - rapl_hw_unit[cfg - 1]);
+ return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -342,7 +342,7 @@ static int rapl_pmu_event_init(struct pe
struct rapl_pmu *rapl_pmu;
/* only look at RAPL events */
- if (event->attr.type != rapl_pmus->pmu.type)
+ if (event->attr.type != rapl_pmus_pkg->pmu.type)
return -ENOENT;
/* check only supported bits are set */
@@ -352,14 +352,14 @@ static int rapl_pmu_event_init(struct pe
if (event->cpu < 0)
return -EINVAL;
- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
+ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
return -EINVAL;
- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
+ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
bit = cfg - 1;
/* check event supported */
- if (!(rapl_cntr_mask & (1 << bit)))
+ if (!(rapl_pkg_cntr_mask & (1 << bit)))
return -EINVAL;
/* unsupported modes and filters */
@@ -367,11 +367,11 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu);
- if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
+ if (rapl_pmu_idx >= rapl_pmus_pkg->nr_rapl_pmu)
return -EINVAL;
/* must be done before validate_group */
- rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
+ rapl_pmu = rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx];
if (!rapl_pmu)
return -EINVAL;
@@ -525,11 +525,11 @@ static struct perf_msr intel_rapl_spr_ms
};
/*
- * Force to PERF_RAPL_MAX size due to:
- * - perf_msr_probe(PERF_RAPL_MAX)
+ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
+ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
* - want to use same event codes across both architectures
*/
-static struct perf_msr amd_rapl_msrs[] = {
+static struct perf_msr amd_rapl_pkg_msrs[] = {
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
@@ -545,8 +545,8 @@ static int rapl_check_hw_unit(void)
/* protect rdmsrl() to handle virtualization */
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
return -1;
- for (i = 0; i < NR_RAPL_DOMAINS; i++)
- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
+ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
switch (rapl_model->unit_quirk) {
/*
@@ -556,11 +556,11 @@ static int rapl_check_hw_unit(void)
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
*/
case RAPL_UNIT_QUIRK_INTEL_HSW:
- rapl_hw_unit[PERF_RAPL_RAM] = 16;
+ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
break;
/* SPR uses a fixed energy unit for Psys domain. */
case RAPL_UNIT_QUIRK_INTEL_SPR:
- rapl_hw_unit[PERF_RAPL_PSYS] = 0;
+ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
break;
default:
break;
@@ -575,9 +575,9 @@ static int rapl_check_hw_unit(void)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
rapl_timer_ms = 2;
- if (rapl_hw_unit[0] < 32) {
+ if (rapl_pkg_hw_unit[0] < 32) {
rapl_timer_ms = (1000 / (2 * 100));
- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
}
return 0;
}
@@ -587,12 +587,12 @@ static void __init rapl_advertise(void)
int i;
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
- hweight32(rapl_cntr_mask), rapl_timer_ms);
+ hweight32(rapl_pkg_cntr_mask), rapl_timer_ms);
- for (i = 0; i < NR_RAPL_DOMAINS; i++) {
- if (rapl_cntr_mask & (1 << i)) {
+ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
+ if (rapl_pkg_cntr_mask & (1 << i)) {
pr_info("hw unit of domain %s 2^-%d Joules\n",
- rapl_domain_names[i], rapl_hw_unit[i]);
+ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
}
}
}
@@ -681,71 +681,71 @@ static int __init init_rapl_pmus(struct
}
static struct rapl_model model_snb = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_snbep = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsw = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_hsx = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_knl = {
- .events = BIT(PERF_RAPL_PKG) |
+ .pkg_events = BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_skl = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PP1) |
BIT(PERF_RAPL_PSYS),
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_pkg_msrs = intel_rapl_msrs,
};
static struct rapl_model model_spr = {
- .events = BIT(PERF_RAPL_PP0) |
+ .pkg_events = BIT(PERF_RAPL_PP0) |
BIT(PERF_RAPL_PKG) |
BIT(PERF_RAPL_RAM) |
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_spr_msrs,
+ .rapl_pkg_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_hygon = {
- .events = BIT(PERF_RAPL_PKG),
+ .pkg_events = BIT(PERF_RAPL_PKG),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
- .rapl_msrs = amd_rapl_msrs,
+ .rapl_pkg_msrs = amd_rapl_pkg_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -801,11 +801,11 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_m
static int __init rapl_pmu_init(void)
{
const struct x86_cpu_id *id;
- int rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
+ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
int ret;
- if (rapl_pmu_is_pkg_scope())
- rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
+ if (rapl_pkg_pmu_is_pkg_scope())
+ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
id = x86_match_cpu(rapl_model_match);
if (!id)
@@ -813,20 +813,20 @@ static int __init rapl_pmu_init(void)
rapl_model = (struct rapl_model *) id->driver_data;
- rapl_msrs = rapl_model->rapl_msrs;
+ rapl_msrs = rapl_model->rapl_pkg_msrs;
- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
- false, (void *) &rapl_model->events);
+ rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
+ false, (void *) &rapl_model->pkg_events);
ret = rapl_check_hw_unit();
if (ret)
return ret;
- ret = init_rapl_pmus(&rapl_pmus, rapl_pmu_scope);
+ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope);
if (ret)
return ret;
- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
if (ret)
goto out;
@@ -835,14 +835,14 @@ static int __init rapl_pmu_init(void)
out:
pr_warn("Initialization failed (%d), disabled\n", ret);
- cleanup_rapl_pmus(rapl_pmus);
+ cleanup_rapl_pmus(rapl_pmus_pkg);
return ret;
}
module_init(rapl_pmu_init);
static void __exit intel_rapl_exit(void)
{
- perf_pmu_unregister(&rapl_pmus->pmu);
- cleanup_rapl_pmus(rapl_pmus);
+ perf_pmu_unregister(&rapl_pmus_pkg->pmu);
+ cleanup_rapl_pmus(rapl_pmus_pkg);
}
module_exit(intel_rapl_exit);

View File

@ -0,0 +1,47 @@
From dbc0343069c8f86fad0d8d9075f70f79114ef10a Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 13 Sep 2024 15:47:59 +0000
Subject: perf/x86/rapl: Remove the global variable rapl_msrs
After making the rapl_model struct global, the rapl_msrs global
variable isn't needed, so remove it.
It will also be cleaner when the new per-core scope PMU is added, as we
will then need to maintain two rapl_msrs arrays (one for the per-core
scope PMU and one for the package scope PMU) inside the rapl_model struct.
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
arch/x86/events/rapl.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -137,7 +137,6 @@ static int rapl_pkg_hw_unit[NR_RAPL_PKG_
static struct rapl_pmus *rapl_pmus_pkg;
static unsigned int rapl_pkg_cntr_mask;
static u64 rapl_timer_ms;
-static struct perf_msr *rapl_msrs;
static struct rapl_model *rapl_model;
/*
@@ -376,7 +375,7 @@ static int rapl_pmu_event_init(struct pe
return -EINVAL;
event->pmu_private = rapl_pmu;
- event->hw.event_base = rapl_msrs[bit].msr;
+ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
event->hw.config = cfg;
event->hw.idx = bit;
@@ -813,9 +812,7 @@ static int __init rapl_pmu_init(void)
rapl_model = (struct rapl_model *) id->driver_data;
- rapl_msrs = rapl_model->rapl_pkg_msrs;
-
- rapl_pkg_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
+ rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
false, (void *) &rapl_model->pkg_events);
ret = rapl_check_hw_unit();

Some files were not shown because too many files have changed in this diff